In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset. 'ISO-8859-1' and 'latin1' name the same codec, so the
# file only needs to be read once.
df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')

# Preview the first rows
print(df.head())

# Display summary statistics
print(df.describe())

# Check for missing values
print(df.isnull().sum())

# Histogram (20 bins) of 'Rating' with a KDE curve overlaid
rating_ax = sns.histplot(df['Rating'], bins=20, kde=True)
rating_ax.set_title('Movie Ratings Distribution')
plt.show()


# Strip the literal ' min' suffix from 'Duration' and parse what is left as a
# number; anything that still fails to parse becomes NaN.
df['Duration'] = pd.to_numeric(
    df['Duration'].str.replace(' min', '', regex=False),
    errors='coerce',
)

# Pairwise view of the two numeric columns
rating_duration = df[['Rating', 'Duration']]
sns.pairplot(rating_duration)
plt.show()



In [100]:
# Parse 'Votes' as numbers (unparsable entries become NaN), then replace the
# NaNs with the median of the values that did parse.
votes_numeric = pd.to_numeric(df['Votes'], errors='coerce')
df['Votes'] = votes_numeric.fillna(votes_numeric.median())

# Normalise 'Duration' to float minutes.
# Casting to str first lets the .str accessor work whether the column still
# holds raw text or has already been converted to numbers, so this cell can be
# re-run safely.
duration_text = df['Duration'].astype(str).str.replace(' min', '', regex=False)

# Coerce instead of a bare astype(float): an unexpected token becomes NaN
# (and is filled below) rather than aborting the cell with a ValueError.
df['Duration'] = pd.to_numeric(duration_text, errors='coerce').astype(float)

# Fill missing values in 'Duration' with the median of valid numeric entries
df['Duration'] = df['Duration'].fillna(df['Duration'].median())




In [101]:
# Convert 'Votes' to numeric, coercing invalid data to NaN
# (a no-op when the column is already numeric).
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Pull the first 4-digit run out of 'Year' and store it as a float.
# - astype(str) keeps the .str accessor usable when the column is already
#   numeric, so re-running the cell does not raise AttributeError.
# - expand=False makes extract return a Series rather than a one-column
#   DataFrame.
df['Year'] = (
    df['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(float)
)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (20 bins) of 'Rating' with a KDE overlay, labelled via the Axes
ratings_ax = sns.histplot(df['Rating'], bins=20, kde=True)
ratings_ax.set_title('Distribution of Movie Ratings')
ratings_ax.set_xlabel('Rating')
ratings_ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Relationship between movie 'Duration' (x) and 'Rating' (y)
duration_ax = sns.scatterplot(x='Duration', y='Rating', data=df)
duration_ax.set_title('Rating vs Duration')
duration_ax.set_xlabel('Duration (minutes)')
duration_ax.set_ylabel('Rating')
plt.show()


In [104]:
# Replace the 'Genre' column with indicator columns, one per distinct value;
# drop_first=True omits the indicator for the first category.
df = pd.get_dummies(data=df, columns=['Genre'], drop_first=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (fresh copy: earlier cleaning on `df` is discarded here)
df = pd.read_csv('IMDb Movies India.csv', encoding='latin1')

# Check for non-numeric columns
print("Data types before conversion:")
print(df.dtypes)

# Columns that should end up numeric
cols_to_convert = ['Duration', 'Votes', 'Rating', 'Year']  # Add any other relevant columns here

# Strip textual decoration BEFORE numeric coercion. Passing the raw text
# straight to pd.to_numeric(errors='coerce') turns every decorated value into
# NaN, and the dropna below would then discard those rows.
# 'Duration' carries a ' min' suffix.
df['Duration'] = df['Duration'].astype(str).str.replace(' min', '', regex=False)
# 'Year' holds a 4-digit year embedded in other characters.
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
# NOTE(review): assumes 'Votes' may use ',' as a thousands separator — confirm
# against the CSV. Removing commas is harmless if it does not.
df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False)

for col in cols_to_convert:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Check for any remaining non-numeric values (should be NaN now)
print("Data types after conversion:")
print(df.dtypes)

# Drop rows that are still missing a value in any of the numeric columns
df = df.dropna(subset=cols_to_convert)

# Calculate and plot the correlation matrix. Restrict to numeric columns:
# the frame still contains text columns, and DataFrame.corr() raises on them
# in pandas >= 2.0.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Distribution of vote counts (20 bins, KDE overlay)
votes_ax = sns.histplot(df['Votes'], bins=20, kde=True)
votes_ax.set_title('Distribution of Votes')
votes_ax.set_xlabel('Votes')
votes_ax.set_ylabel('Frequency')
plt.show()

# Number of rows per 'Year' value; tick labels rotated so they do not overlap
year_ax = sns.countplot(data=df, x='Year')
year_ax.set_title('Number of Movies Released by Year')
plt.xticks(rotation=45)
plt.show()


In [107]:
# Derive 'Movie_Age' as the gap between a fixed reference year and 'Year'
current_year = 2024  # or use datetime.now().year
df = df.assign(Movie_Age=current_year - df['Year'])


In [None]:
# 'Movie_Age' (x) against 'Rating' (y)
age_ax = sns.scatterplot(x='Movie_Age', y='Rating', data=df)
age_ax.set_title('Movie Age vs Rating')
age_ax.set_xlabel('Movie Age (Years)')
age_ax.set_ylabel('Rating')
plt.show()


In [None]:
# Calculate and plot the correlation matrix.
# Only numeric columns are correlated: `df` still contains text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently ignored such columns, so the result is unchanged there).
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Summary statistics (count, mean, std, quantiles, ...) for 'Movie_Age'
movie_age_summary = df['Movie_Age'].describe()
print(movie_age_summary)


In [None]:
# Mean 'Rating' per 'Year', kept as a two-column frame for plotting
avg_rating_by_year = df.groupby('Year', as_index=False)['Rating'].mean()

# Line chart of the yearly mean on a 12x6 figure
fig, year_rating_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='Year', y='Rating', data=avg_rating_by_year, ax=year_rating_ax)
year_rating_ax.set_title('Average Movie Rating Over the Years')
year_rating_ax.set_xlabel('Year')
year_rating_ax.set_ylabel('Average Rating')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Work on an explicit copy so adding a column does not write into a slice of
# `df` (which triggers SettingWithCopyWarning).
df_genres = df[['Genre', 'Rating']].copy()
df_genres['Present'] = 1  # Presence marker kept from the original cell

# Split the genres into separate rows.
# DataFrame.explode only expands list-like cells; a plain string is left
# untouched, so the text has to be split into a list first.
# NOTE(review): assumes multi-genre entries are comma-separated — confirm
# against the CSV. Entries without a comma pass through unchanged.
df_genres_melted = (
    df_genres
    .assign(Genre=df_genres['Genre'].str.split(','))
    .explode('Genre')
)
# Remove the whitespace left around each piece by the split
df_genres_melted['Genre'] = df_genres_melted['Genre'].str.strip()

avg_rating_per_genre = df_genres_melted.groupby('Genre')['Rating'].mean().reset_index()

# Plotting average rating by genre
plt.figure(figsize=(12, 6))
sns.barplot(data=avg_rating_per_genre, x='Rating', y='Genre')
plt.title('Average Movie Rating by Genre')
plt.xlabel('Average Rating')
plt.ylabel('Genre')
plt.show()


In [None]:
# 'Votes' (x, log-scaled) against 'Rating' (y) on a 12x6 figure
fig, votes_rating_ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Votes', y='Rating', data=df, ax=votes_rating_ax)
votes_rating_ax.set_title('Votes vs. Rating')
votes_rating_ax.set_xlabel('Votes')
votes_rating_ax.set_ylabel('Rating')
votes_rating_ax.set_xscale('log')  # Log scale for better visualization
plt.show()


In [None]:
# Rows per 'Year' value, ordered by year
year_counts = df['Year'].value_counts()
movies_per_year = year_counts.sort_index()

# Line chart of the yearly counts on a 12x6 figure
fig, per_year_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=movies_per_year.index, y=movies_per_year.values, ax=per_year_ax)
per_year_ax.set_title('Number of Movies Released Per Year')
per_year_ax.set_xlabel('Year')
per_year_ax.set_ylabel('Number of Movies')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Mean 'Rating' per 'Director', keeping the ten highest means
avg_rating_by_director = (
    df.groupby('Director')['Rating']
    .mean()
    .reset_index()
    .sort_values(by='Rating', ascending=False)
    .head(10)  # Top 10 directors
)

# Horizontal bar chart of those ten directors on a 12x6 figure
fig, director_ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Rating', y='Director', data=avg_rating_by_director, ax=director_ax)
director_ax.set_title('Top 10 Directors by Average Rating')
director_ax.set_xlabel('Average Rating')
director_ax.set_ylabel('Director')
plt.show()


In [None]:
# Calculate and plot the correlation matrix.
# Only numeric columns are correlated: `df` still contains text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# silently ignored such columns, so the result is unchanged there).
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', square=True)
plt.title('Correlation Matrix')
plt.show()
