In [None]:
# Colab-specific: request a file upload through google.colab's `files` helper.
from google.colab import files
# The return value is kept in `uploaded`; no later cell in this notebook reads it
# (the CSV is opened by its file name instead).
uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
import warnings
# NOTE(review): this installs an 'ignore' filter for every warning category, so
# deprecation notices from the libraries used below will not be shown either.
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset into `df`; the file is decoded as latin-1.
df = pd.read_csv("IMDb Movies India.csv",encoding="latin-1")

In [None]:
# Display the freshly loaded frame.
df

In [None]:
# Check for missing values in each column.
# The header starts with a real newline escape ("\n") so it prints a blank
# separator line, like the other section headers in this notebook.
print("\nMissing values in each column:")
df.isnull().sum()

In [None]:
# How often each distinct 'Name' value occurs (header and table in one call).
print("\nCount of unique values in 'Name' column:", df['Name'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Year' value occurs (header and table in one call).
print("\nCount of unique values in 'Year' column:", df['Year'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Duration' value occurs (header and table in one call).
print("\nCount of unique values in 'Duration' column:", df['Duration'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Genre' value occurs (header and table in one call).
print("\nCount of unique values in 'Genre' column:", df['Genre'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Rating' value occurs (header and table in one call).
print("\nCount of unique values in 'Rating' column:", df['Rating'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Votes' value occurs (header and table in one call).
print("\nCount of unique values in 'Votes' column:", df['Votes'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Director' value occurs (header and table in one call).
print("\nCount of unique values in 'Director' column:", df['Director'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Actor 1' value occurs (header and table in one call).
print("\nCount of unique values in 'Actor 1' column:", df['Actor 1'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Actor 2' value occurs (header and table in one call).
print("\nCount of unique values in 'Actor 2' column:", df['Actor 2'].value_counts(), sep="\n")

In [None]:
# How often each distinct 'Actor 3' value occurs (header and table in one call).
print("\nCount of unique values in 'Actor 3' column:", df['Actor 3'].value_counts(), sep="\n")

In [None]:
# Number of rows that exactly repeat an earlier row.
print("\nDuplicate rows in the Dataframe:", df.duplicated().sum(), sep="\n")

In [None]:
# Remove the repeated rows counted in the previous cell; `df` is rebound to the result.
df = df.drop_duplicates()

In [None]:
# Print the frame's info() summary after de-duplication.
df.info()

In [None]:
# Show describe() statistics for the frame.
df.describe()

In [None]:
# Convert 'Year' to a number.
# pd.to_numeric(..., errors='coerce') on its own turns every decorated value
# (for example "(2019)") into NaN; if that happens to all rows the median is NaN
# too and the fill below does nothing. Extracting the four-digit year first keeps
# such values, and already-numeric input passes through unchanged.
# NOTE(review): the decorated "(YYYY)" layout is an assumption -- confirm against the CSV.
df['Year'] = pd.to_numeric(
    df['Year'].astype(str).str.extract(r'(\d{4})', expand=False),
    errors='coerce',
)

# Fill whatever is still missing with the median year.
median_year = df['Year'].median()
df['Year'] = df['Year'].fillna(median_year)


In [None]:

# Convert 'Duration' to a number.
# A value carrying a unit suffix (for example "109 min") would be coerced to NaN
# by pd.to_numeric alone, so the leading numeric part is extracted first.
# Already-numeric input passes through unchanged.
# NOTE(review): the "<number> min" layout is an assumption -- confirm against the CSV.
df['Duration'] = pd.to_numeric(
    df['Duration'].astype(str).str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)

# Fill whatever is still missing with the median duration.
median_duration = df['Duration'].median()
df['Duration'] = df['Duration'].fillna(median_duration)


In [None]:
# Coerce 'Rating' to a numeric dtype (unparseable entries become NaN) ...
df = df.assign(Rating=pd.to_numeric(df['Rating'], errors='coerce'))

# ... then impute the gaps with the column median.
# NOTE(review): 'Rating' is later used as the regression target, so imputed rows
# are trained and scored on as if they were real ratings -- confirm this is intended.
median_rating = df['Rating'].median()
df = df.fillna({'Rating': median_rating})


In [None]:
# Convert 'Votes' to a number.
# Thousands separators (for example "1,086") make pd.to_numeric coerce the value
# to NaN, so commas are stripped before conversion. Anything that still cannot be
# parsed becomes NaN and is filled with the median below.
# NOTE(review): comma-formatted counts are an assumption -- confirm against the CSV.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
)
median_Votes = df['Votes'].median()
df['Votes'] = df['Votes'].fillna(median_Votes)

In [None]:
# Remove 'Year', 'Duration' and 'Votes' from the frame; columns that are already
# absent are skipped, so re-running the cell is harmless.
# NOTE(review): these are the columns the preceding cells just converted and
# imputed -- confirm that discarding them is intended.
df = df.drop(columns=['Year', 'Duration', 'Votes'], errors='ignore')

In [None]:
# Display the frame after the column drop above.
df

In [None]:
# Histogram of the movie ratings actually present in the dataset.
# The cell is labelled as a ratings plot, so it draws df['Rating'] rather than
# synthetic normally-distributed samples. Missing ratings are dropped because
# they cannot be assigned to a bin.
plt.figure(figsize=(6, 4))
plt.hist(df['Rating'].dropna(), bins=30, color='purple', edgecolor='black')
plt.title('Distribution of movie ratings')
plt.xlabel('Rating')            # x-axis of a histogram is the plotted variable
plt.ylabel('Number of movies')  # y-axis is the count of rows per bin
plt.grid(True)
plt.show()

In [None]:
# Replace missing director / lead-actor names with a placeholder string.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on a column selected with df[col] operates on an
# intermediate object and is not guaranteed to update df itself.
df['Director'] = df['Director'].fillna('Unknown')
df['Actor 1'] = df['Actor 1'].fillna('Unknown')

# Scatter the first four (director, lead actor) pairs; both axes are categorical.
plt.figure(figsize=(6,4))
x=df["Director"].head(4)
y=df["Actor 1"].head(4)
plt.scatter(x,y,color='red',alpha=0.7)
plt.title("Name of Actor 1 according to the Director")
plt.xlabel("Director")
plt.ylabel("Actor1")
plt.grid(True)
plt.show()

In [None]:
# Count the number of movies directed by each director.
# 'Unknown' is the placeholder this notebook writes into missing director names;
# it is removed from the tally so it cannot be ranked as if it were a person.
director_counts = df['Director'].value_counts().drop('Unknown', errors='ignore')

# Select the 10 directors with the most movies (value_counts is already sorted).
top_10_directors = director_counts.head(10)

# Bar plot of the top 10 directors by number of movies directed.
plt.figure(figsize=(12, 6))
plt.bar(top_10_directors.index, top_10_directors.values)
plt.title('Top 10 Directors with the Most Movies Directed')
plt.xlabel('Director')
plt.ylabel('Number of Movies Directed')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Count appearances per actor across the three actor columns.
# stack() pools the columns into one Series and skips missing cells. The
# 'Unknown' placeholder (written into 'Actor 1' for missing names) is then
# removed so it is not ranked as an actor.
actor_counts = (
    df[['Actor 1', 'Actor 2', 'Actor 3']]
    .stack()
    .value_counts()
    .drop('Unknown', errors='ignore')
)

# Select the 10 actors with the most appearances.
top_10_actors = actor_counts.head(10)

# Bar plot of the top 10 actors by number of appearances.
plt.figure(figsize=(12, 6))
plt.bar(top_10_actors.index, top_10_actors.values,  color='skyblue')
plt.title('Top 10 Actors with the Most Movie Appearances')
plt.xlabel('Actor')
plt.ylabel('Number of Movie Appearances')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Highest rating achieved by each director.
# The 'Unknown' placeholder group (missing director names) is removed so it
# cannot appear in the ranking.
directors_max_rating = (
    df.groupby('Director')['Rating'].max().drop('Unknown', errors='ignore')
)

# Sort by that maximum, best first, and keep the top 10.
top_10_directors = directors_max_rating.sort_values(ascending=False).head(10)

# Bar plot of the 10 directors with the highest-rated movies.
plt.figure(figsize=(12, 6))
plt.bar(top_10_directors.index, top_10_directors.values)
plt.title('Top 10 Directors with the Highest-Rated Movies')
plt.xlabel('Director')
plt.ylabel('Maximum Rating')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Mean rating per genre label.
genre_mean_ratings = df.groupby('Genre')['Rating'].mean()

# Best-rated genres first; keep the leading ten.
top_10_genres = genre_mean_ratings.sort_values(ascending=False).iloc[:10]

# Draw the ranking on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_10_genres.index, top_10_genres.values, color='skyblue')
ax.set_title('Top 10 Highly Rated Movie Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Mean Rating')
ax.tick_params(axis='x', labelrotation=90)  # vertical labels for long genre names
plt.show()

In [None]:
# Count how often each (Director, Actor 1) pair occurs.
# Rows where either side is the 'Unknown' placeholder are left out: they stand
# for missing names, not for a real director/actor pairing.
has_both_names = (df['Director'] != 'Unknown') & (df['Actor 1'] != 'Unknown')
collaboration_counts = (
    df[has_both_names]
    .groupby(['Director', 'Actor 1'])
    .size()
    .reset_index(name='collaboration_count')
)

# Most frequent pairings first.
frequent_collaborations = collaboration_counts.sort_values(by='collaboration_count', ascending=False)

# Display the most frequent collaborations (adjust the number shown as needed).
print("Most frequent Director and Actor 1 collaborations:")
display(frequent_collaborations.head(20)) # Display top 20 collaborations

In [None]:
# Correlation heatmap over the numeric columns of `df`.
# NOTE(review): after the earlier column drop, 'Rating' may be the only numeric
# column left, which would make this a 1x1 matrix -- confirm.
numerical_df = df.select_dtypes(include=np.number)
corr_matrix = numerical_df.corr()

fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap of Numerical Columns")
plt.show()

In [None]:
# Switch seaborn to its 'whitegrid' style; only plots drawn after this cell are affected.
sns.set(style='whitegrid')

In [None]:
# Distribution of movie ratings: histogram with a KDE overlay.
# The only variable drawn is 'Rating', so the x-axis is labelled 'Rating' and the
# y-axis 'Count' (rows per bin); no duration data is involved in this plot.
plt.figure(figsize=(6,4))
sns.histplot(df['Rating'],bins=20,kde=True, color='purple')
plt.title('Distribution of movie ratings')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.show()

In [None]:
# Scatter of rating against movie name, coloured by genre.
# NOTE(review): 'Name' and 'Genre' are free-text columns; if they hold many
# distinct values the axis and legend will be unreadable -- confirm this plot is useful.
fig, ax = plt.subplots(figsize=(6,4))
sns.scatterplot(data=df, x='Name', y='Rating', hue='Genre', palette='deep', alpha=0.7, ax=ax)
ax.set_title('Name Vs Rating')
ax.set_xlabel('Name')
ax.set_ylabel('Rating')
plt.show()

In [None]:
# Find the top N directors by number of movies (adjust N as needed).
# The 'Unknown' placeholder for missing names is excluded before ranking so the
# chart only shows real directors.
top_n = 15  # For example, top 15 directors
top_directors = (
    df['Director']
    .value_counts()
    .drop('Unknown', errors='ignore')
    .nlargest(top_n)
    .index
)

# Keep only the movies made by those directors.
df_top_directors = df[df['Director'].isin(top_directors)].copy()

# Count plot: one bar per (director, genre) combination.
plt.figure(figsize=(12, 8))
sns.countplot(data=df_top_directors, x='Director', hue='Genre')
plt.title(f'Genre Distribution for Top {top_n} Directors')
plt.xlabel('Director')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right') # Rotate x-axis labels for better readability
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.ylim(0, 5) # Zoom in on low counts; bars taller than 5 are clipped
plt.show()

In [None]:
# Count the occurrences of each Director-Genre combination.
director_genre_counts = df.groupby(['Director', 'Genre']).size().reset_index(name='count')

# Top 15 directors by number of movies; the 'Unknown' placeholder for missing
# names is excluded so only real directors are charted.
top_directors = (
    df['Director']
    .value_counts()
    .drop('Unknown', errors='ignore')
    .nlargest(15)
    .index
)

# Keep the counts that belong to those directors.
top_directors_genre_counts = director_genre_counts[director_genre_counts['Director'].isin(top_directors)]

# One row per director, one column per genre; absent combinations count as 0.
pivot_df = top_directors_genre_counts.pivot(index='Director', columns='Genre', values='count').fillna(0)

# Stacked bar chart. The Axes is created first and handed to DataFrame.plot via
# `ax=`; calling plt.figure() and then DataFrame.plot() without `ax` would leave
# an extra, empty figure in the output because plot() opens its own.
fig, ax = plt.subplots(figsize=(12, 8))
pivot_df.plot(kind='bar', stacked=True, ax=ax)

ax.set_title('Genre Distribution for Top Directors')
ax.set_xlabel('Director')
ax.set_ylabel('Number of Movies')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Pair plot of the selected columns, coloured by genre.
# sns.pairplot builds its own figure, so no plt.figure() call precedes it -- one
# would only add an empty figure to the cell output.
# NOTE(review): apart from 'Rating' the selected columns are text; presumably
# seaborn only plots the numeric ones -- confirm the resulting grid is what is wanted.
sns.pairplot(
    df[["Name","Genre","Rating","Director","Actor 1","Actor 2","Actor 3"]],
    diag_kind="kde",
    hue="Genre",
)
plt.show()

In [None]:
# Keep only the numeric columns of `df` ...
numerical_columns = df.select_dtypes(include='number')

# ... and draw every pairwise relationship between them.
sns.pairplot(data=numerical_columns)
plt.show()


In [None]:
# Number of movies per director.
# The 'Unknown' placeholder for missing names is dropped so the pie only shows
# shares of real directors.
director_counts = df['Director'].value_counts().drop('Unknown', errors='ignore')

# Select the top N directors (adjust N as needed).
top_n = 10  # For example, top 10 directors
top_directors_counts = director_counts.nlargest(top_n)

# Pie chart; percentages are relative to the top-N total, not to the whole dataset.
plt.figure(figsize=(8, 8))
plt.pie(top_directors_counts, labels=top_directors_counts.index, autopct='%1.1f%%', startangle=140)
plt.title(f'Top {top_n} Directors Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.show()

In [None]:
# Predictor: the 'Genre' label. Target: the numeric 'Rating'.
X=df["Genre"]
y=df["Rating"]

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

# One-hot encode the training genres; the first level is dropped and becomes the
# baseline the coefficients are measured against.
X_train = pd.get_dummies(X_train, drop_first=True)

# Encode the test genres WITHOUT drop_first, then project onto the training
# columns. Dropping the first level independently on the test split can remove a
# different genre than on the training split; that genre's column would then be
# zero-filled and its rows silently treated as the baseline.
# After reindexing: genres unseen in training are discarded, training genres
# absent from the test split are added as all-zero columns.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

# Impute missing targets with the training median (computed once, so the test
# split is filled from training data only).
train_median = y_train.median()
y_train = y_train.fillna(train_median)
y_test = y_test.fillna(train_median)

# Fit an ordinary least-squares model and predict the held-out ratings.
simple_model=LinearRegression()
simple_model.fit(X_train,y_train)
y_pred=simple_model.predict(X_test)

# Scatter of actual vs. predicted ratings on the test split.
plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.5, color='blue', label='Actual vs Predicted (Test Data)')
plt.title('Actual vs Predicted Values')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.legend()
plt.grid(True)

# Show the plot
plt.show()

In [None]:
# Fill any gaps in the training target with its own median, then fit a fresh
# linear model on the encoded training features.
y_train = y_train.fillna(value=y_train.median())
simple_model = LinearRegression().fit(X_train, y_train)

In [None]:
# Fill any gaps in the test target using the TRAINING median.
y_test = y_test.fillna(value=y_train.median())

# Predicted ratings for the held-out rows.
y_pred = simple_model.predict(X_test)

In [None]:
# Score the predictions against the held-out targets, then report both metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score:{r2}')

In [None]:
# Tabulate the fitted coefficients, one row per encoded feature column.
coefficients = pd.DataFrame(
    {'Coefficient': simple_model.coef_},
    index=X_train.columns,
)
print(coefficients)