In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load the raw video game sales dataset
data = pd.read_csv("../input/videogamesales/vgsales.csv")

# Keep only games released in or before 2015. Rows with a missing Year are
# dropped explicitly; the plain `Year <= 2015` mask already excluded them
# implicitly, because a comparison against NaN evaluates to False.
# Year is then cast to int: if pandas parsed it as float (which it does for
# integer columns that contain missing values), the years would otherwise be
# grouped and labelled as e.g. 2006.0 in the plots that follow.
# NOTE(review): assumes every remaining Year is a whole number — confirm.
data = (
    data.dropna(subset=["Year"])
    .loc[lambda frame: frame["Year"] <= 2015]
    .astype({"Year": int})
)

# Count games per genre. value_counts() sorts in descending order, so the
# first ten index entries are the ten most common genres.
genre_counts = data["Genre"].value_counts()
top_genres = genre_counts.index[:10]

In [None]:
# Bar chart of the number of games in each of the most common genres,
# ordered from most to least frequent
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x="Genre", order=top_genres, palette="viridis", ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set(xlabel="Genre", ylabel="Count", title="Number of Games by Genre")
plt.show()

In [None]:
# Bar chart of the number of games released in each year
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=data, x="Year", palette="viridis", ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set(xlabel="Year", ylabel="Count", title="Number of Games by Year")
plt.show()

In [None]:
# For every year, find the genre with the most releases.
# Step 1: one row per (Year, Genre) pair holding the number of games.
genre_year_counts = (
    data.groupby(["Year", "Genre"])
    .size()
    .reset_index(name="Count")
)
# Step 2: within each year, take the row label of the largest Count and
# keep only those rows.
top_row_labels = genre_year_counts.groupby("Year")["Count"].idxmax()
year_max_genre = genre_year_counts.loc[top_row_labels]

In [None]:
# One bar per year showing the release count of that year's leading genre;
# the bar colour identifies which genre it is
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=year_max_genre,
    x="Year",
    y="Count",
    hue="Genre",
    palette="viridis",
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=45)
ax.set(xlabel="Year", ylabel="Count", title="Top Genres Released Each Year")
ax.legend(title="Genre", loc="upper right")
plt.show()

In [None]:
# Total the Global_Sales column for each year, keeping Year as a regular
# column so it can be used directly as the x variable
yearly_sales = data.groupby("Year", as_index=False)["Global_Sales"].sum()

# Bar chart of the yearly totals
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=yearly_sales, x="Year", y="Global_Sales", palette="viridis", ax=ax)
ax.tick_params(axis="x", labelrotation=45)
ax.set(xlabel="Year", ylabel="Global Sales", title="Global Sales by Year")
plt.show()

In [None]:
# Reload the raw dataset so this modelling cell starts from the file on disk
# rather than from whatever state `data` was left in by earlier cells
data = pd.read_csv("../input/videogamesales/vgsales.csv")

# Keep only games released in or before 2015. Rows with a missing Year fail
# the comparison (NaN <= 2015 is False) and are dropped as well.
data = data[data["Year"] <= 2015]

# Preprocessing: keep only the model columns and drop rows with any missing value
features = ['Year', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']
target = 'Global_Sales'
data = data[features + [target]].dropna()

# NOTE(review): Global_Sales is presumably the sum of the regional sales
# columns used as features. If so, the target is leaked into the inputs and
# the near-perfect fit says nothing about predictive power — confirm.

# Split the data into features (X) and target variable (y)
X = data[features]
y = data[target]

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, the fitted coefficients and the printed predictions reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Compare predicted and actual values side by side
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(predictions.head())

# Report the coefficient of determination on the held-out rows so the fit
# is quantified rather than judged from five sample predictions
print(f"Test R^2: {model.score(X_test, y_test):.4f}")