In [None]:
import pandas as pd

df = pd.read_csv("adventure_movies_dataset.csv")

print ("FIRST 10 ROWS OF DATA:\n")
df.head(10)

In [None]:
df.info()

In [19]:
#Check number of rows and columns
print ("Numbersof rows:", df.shape[0])
print ("Number of columns:", df.shape[1])

#Display column names
print ("Column Names:", df.columns)

Numbersof rows: 80000
Number of columns: 10
Column Names: Index(['Movie_ID', 'Title', 'Year', 'Genre', 'Director', 'Budget_Million',
       'BoxOffice_Million', 'Runtime_Minutes', 'Rating', 'Award_Won'],
      dtype='object')


In [20]:
missing_movieid = df["Movie_ID"].isnull().sum()
missing_title = df["Title"].isnull().sum()
missing_year = df["Year"].isnull().sum()
missing_genre = df["Genre"].isnull().sum()
missing_director = df["Director"].isnull().sum()
missing_budget = df["Budget_Million"].isnull().sum()
missing_boxoffice = df["BoxOffice_Million"].isnull().sum()
missing_runtime = df["Runtime_Minutes"].isnull().sum()
missing_rating = df["Rating"].isnull().sum()
missing_award = df["Award_Won"].isnull().sum()


print ("Missing values in Movie ID:", missing_movieid)
print ("Missing values in Title:", missing_title)
print ("Missing values in Year:", missing_year)
print ("Missing values in Genre:", missing_genre)
print ("Missing values in Director:", missing_director)
print ("Missing values in Budget_Million:", missing_budget)
print ("Missing values in BoxOffice_Million:", missing_boxoffice)
print ("Missing values in Runtime:", missing_runtime)
print ("Missing values in Rating:", missing_rating)
print ("Missing values in Award_Won:", missing_award)

Missing values in Movie ID: 0
Missing values in Title: 0
Missing values in Year: 0
Missing values in Genre: 0
Missing values in Director: 0
Missing values in Budget_Million: 0
Missing values in BoxOffice_Million: 0
Missing values in Runtime: 0
Missing values in Rating: 0
Missing values in Award_Won: 0


In [22]:
print (df["Genre"].value_counts())

Genre
Adventure    56110
Sci-Fi        8021
Fantasy       7951
Action        7918
Name: count, dtype: int64


In [23]:
print ("Average Box Office Revenue(Million USD):",
        df["BoxOffice_Million"].mean())

Average Box Office Revenue(Million USD): 200.40606034174928


In [37]:
max_budget = df["Budget_Million"].max()
print ("Maximum Budget (Million USD):", max_budget)
print (df[df["Budget_Million"] == max_budget])

Maximum Budget (Million USD): 289.10939244983206
        Movie_ID                 Title  Year      Genre    Director  \
36373  MOV036374  Adventure Film 36374  2020  Adventure  Director A   

       Budget_Million  BoxOffice_Million  Runtime_Minutes  Rating Award_Won  
36373      289.109392         243.903081        116.06643     1.3        No  


In [28]:
awardcount = df["Award_Won"].value_counts()
print ("Movies with Awards:\n")
print (awardcount)

Movies with Awards:

Award_Won
No     56148
Yes    23852
Name: count, dtype: int64


In [32]:
print ("Median Runtime (Minutes):", df["Runtime_Minutes"].median())

Median Runtime (Minutes): 126.12850455408491


In [35]:
correlation = df["Budget_Million"].corr(df["BoxOffice_Million"])
print ("Correlation b/w Budget and Box Office Revenue:", correlation)

Correlation b/w Budget and Box Office Revenue: 0.0004596886358127435


In [40]:
top_directors = df["Director"].value_counts().head(5)
print ("Top 5 Director with the most movies directed:\n", top_directors)

Top 5 Director with the most movies directed:
 Director
Director E    16051
Director B    16050
Director D    16007
Director C    15992
Director A    15900
Name: count, dtype: int64


In [44]:
yearlyrevenue = df.groupby("Year")["BoxOffice_Million"].mean()
highestyear = yearlyrevenue.idxmax()
highestrevenue = yearlyrevenue.max()

print ("Year with the Highest Average Box Office Revenue:\n", highestyear)
print ("Highest Average Revenue (Million USD):\n", highestrevenue)

Year with the Highest Average Box Office Revenue:
 2012
Highest Average Revenue (Million USD):
 205.1806124573583


In [46]:
genrepercentage = df["Genre"].value_counts(normalize=True) * 100
print ("Percentage of Movies by Genre:\n", genrepercentage)

Percentage of Movies by Genre:
 Genre
Adventure    70.13750
Sci-Fi       10.02625
Fantasy       9.93875
Action        9.89750
Name: proportion, dtype: float64


In [6]:
df.shape

(80000, 10)

In [7]:
#total rows before removing duplicates
rows_beforeduplicates = df.shape[0]
duplicates = df.duplicated().sum()
#remove duplicates
df = df.drop_duplicates()

#total rows after removing duplicates
rows_afterduplicates = df.shape[0]

print ("Rows before removing duplicates:", rows_beforeduplicates)
print ("Rows after removing duplicates:", rows_afterduplicates)
print ("Number of Duplicate Rows:", duplicates)
print (df.shape)

Rows before removing duplicates: 80000
Rows after removing duplicates: 80000
Number of Duplicate Rows: 0
(80000, 10)


In [8]:
print ("                                 Descriptive statistics:\n")
df.describe()

                                 Descriptive statistics:



Unnamed: 0,Year,Budget_Million,BoxOffice_Million,Runtime_Minutes,Rating
count,80000.0,80000.0,80000.0,80000.0,80000.0
mean,2001.509425,100.703075,200.40606,126.957796,5.498771
std,12.660702,48.357263,97.593747,14.465309,2.593778
min,1980.0,10.0,5.0,80.0,1.0
25%,1991.0,66.28695,132.262816,116.96781,3.3
50%,2002.0,100.072418,199.069221,126.128505,5.5
75%,2012.0,133.697708,266.389786,136.303363,7.7
max,2023.0,289.109392,621.936634,180.0,10.0


In [9]:
#Correlation Matrix

#seelcting numerical columns only
numerical_df = df.select_dtypes(include=['float64', 'int64'])

#calculate correlation matrix
correlation_matrix = numerical_df.corr()

print ("Correlation matrix:")
correlation_matrix

Correlation matrix:


Unnamed: 0,Year,Budget_Million,BoxOffice_Million,Runtime_Minutes,Rating
Year,1.0,0.002123,-0.000878,-0.004212,0.000684
Budget_Million,0.002123,1.0,0.00046,0.003103,0.00286
BoxOffice_Million,-0.000878,0.00046,1.0,-0.002474,-0.004357
Runtime_Minutes,-0.004212,0.003103,-0.002474,1.0,-0.007358
Rating,0.000684,0.00286,-0.004357,-0.007358,1.0


In [10]:
from sklearn.model_selection import train_test_split

#select features and target
x = df[["Year", "Budget_Million", "BoxOffice_Million", "Runtime_Minutes"]]

y = df["Award_Won"]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2, random_state=42)

In [12]:
print ("Shapes of training and test data sets:\n")
print (X_train.shape, X_test.shape, y_train.shape, y_test.shape)

Shapes of training and test data sets:

(64000, 4) (16000, 4) (64000,) (16000,)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
rf_model = RandomForestClassifier (random_state=42) 
rf_model.fit(X_train, y_train)

In [14]:
y_pred_rf = rf_model.predict(X_test)
print(y_pred_rf)

['No' 'Yes' 'No' ... 'No' 'No' 'No']


In [15]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:")
print(accuracy_rf)

Random Forest Accuracy:
0.6824375


In [16]:
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

          No       0.70      0.96      0.81     11222
         Yes       0.28      0.04      0.07      4778

    accuracy                           0.68     16000
   macro avg       0.49      0.50      0.44     16000
weighted avg       0.57      0.68      0.59     16000



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# Preprocessing the dataset
# Encode the target variable (Award_Won: Yes -> 1, No -> 0)
label_encoder = LabelEncoder()
df['Award_Won'] = label_encoder.fit_transform(df['Award_Won'])

# Select features and target
X = df[['Budget_Million', 'BoxOffice_Million', 'Runtime_Minutes', 'Rating']]  # Numerical features
y = df['Award_Won']  # Target variable

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train the SVM model
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print (accuracy)
print (report)

In [42]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()


# Select features and target for regression
X = df[['Budget_Million', 'Runtime_Minutes', 'Rating']]  # Example features
y = df['BoxOffice_Million']  # Target variable

# Standardize the features
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train the SVR model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # RBF kernel
svr_model.fit(X_train, y_train)

# Make predictions
y_pred = svr_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print (mse)
print (r2)


9505.759527145092
-0.0012868114723645352
