## IMPORTING THE LIBRARIES

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb

## IMPORTING AND ANALYSING THE DATASET

In [None]:
# Load the raw box-office table; the file is decoded as Latin-1.
dataset = pd.read_csv(
    'boxoffice.csv',
    encoding='latin-1',
)

In [None]:
# Only domestic_revenue is predicted, so the other revenue columns are dropped.
to_remove = ['world_revenue', 'opening_revenue']
dataset.drop(columns=to_remove, inplace=True)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
dataset.info()

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

### Filling The Empty Values

In [None]:
# Missing ratings get an explicit 'Not Rated' label; missing genres get the
# most frequent genre string in the column.
dataset['MPAA'] = dataset['MPAA'].fillna('Not Rated')
dataset['genres'] = dataset['genres'].fillna(dataset['genres'].mode()[0])

# Confirm both columns are now free of nulls. A notebook cell only displays
# its last expression, so both columns are checked in one expression.
dataset[['MPAA', 'genres']].isnull().sum()

### Cleaning data (converting string columns to numeric)

In [None]:
# Convert the formatted text columns (e.g. "$1,234,567") into numeric columns.
#
# The dollar sign is removed by value rather than by slicing off the first
# character, so an entry that has no leading '$' does not lose its first digit.
for col in ['domestic_revenue', 'opening_theaters', 'release_days', 'budget']:
    # Going through str keeps the cleanup working even when pandas has already
    # parsed a column as numeric (the .str accessor raises on non-string data).
    text = dataset[col].astype(str).str.strip()
    text = text.str.replace('$', '', regex=False).str.replace(',', '', regex=False)

    # errors='coerce' turns anything that still is not a number, including
    # missing entries, into NaN; these are handled by the imputation step.
    dataset[col] = pd.to_numeric(text, errors='coerce')

In [None]:
from sklearn.impute import SimpleImputer

# Replace NaNs with the column mean in the columns at positions 3 and 4.
# NOTE(review): these positions presumably select opening_theaters and budget;
# confirm against dataset.columns, since positional indexing breaks silently
# if the column order changes.
for position in (3, 4):
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X = dataset.iloc[:, position:position + 1].values
    imputer.fit(X)
    # Write the imputed values back into the same column.
    dataset.iloc[:, position:position + 1] = imputer.transform(X)

### Visualization

In [None]:
# Bar chart with the number of rows for each distinct distributor.
plt.figure(figsize=(20, 10))
sb.countplot(x='distributor', data=dataset)
plt.show()

In [None]:
# Mean domestic revenue for each distinct MPAA rating.
revenue_by_rating = dataset.groupby('MPAA')['domestic_revenue']
revenue_by_rating.mean()

## DATASET MODIFICATION

In [None]:
# Split the multi-genre strings into individual per-genre count columns.

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
# Learn the genre vocabulary and build the row-by-genre count matrix in one step.
features = vectorizer.fit_transform(dataset['genres']).toarray()

# One new column per vocabulary entry, named after the token.
genres = vectorizer.get_feature_names_out()
for name, counts in zip(genres, features.T):
    dataset[name] = counts

# The raw text column is no longer needed once it has been expanded.
dataset.drop(columns='genres', inplace=True)

In [None]:
# Preview the first rows after the genre columns were added.
dataset.head(n=5)

In [None]:
# Remove genre columns in which more than 95% of the values are zero.
#
# The columns are taken from `genres` (the names produced by the vectorizer)
# instead of a hard-coded 'action':'western' label slice, which raises KeyError
# when either label is absent and skips genre columns outside that range.
sparse_genres = [
    col for col in genres
    if col in dataset.columns and (dataset[col] == 0).mean() > 0.95
]
removed = len(sparse_genres)  # number of genre columns dropped
dataset.drop(columns=sparse_genres, inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the category labels of each column with integer codes,
# using a fresh encoder per column.
for col in ('distributor', 'MPAA'):
    le = LabelEncoder()
    encoded = le.fit_transform(dataset[col])
    dataset[col] = encoded

In [None]:
# Boolean heatmap marking column pairs whose correlation exceeds 0.8;
# the first column is skipped.
plt.figure(figsize=(15, 15))
high_corr = dataset.iloc[:, 1:].corr() > 0.8
sb.heatmap(high_corr, annot=True, cbar=False)
plt.show()

In [None]:
# Preview the first rows of the fully prepared table.
dataset.head(n=5)

## SPLITTING TRAINING SET AND TEST SET

In [None]:
# The target is the domestic revenue; the predictors are every other column
# except the title.
target = dataset['domestic_revenue']
features = dataset.drop(columns=['domestic_revenue', 'title'])

print(features)
print(target)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Shapes of the four parts of the split, printed in the order
# X_train, y_train, X_test, y_test.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

# TRAINING MODEL

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split.
# Each model cell rebinds `regressor`, so the cell run last is the one evaluated.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Expand the predictors into degree-4 polynomial terms, then fit a linear model on them.
polynomial_expansion = PolynomialFeatures(degree=4)
regressor = make_pipeline(polynomial_expansion, LinearRegression())
regressor.fit(X_train, y_train)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single regression tree; the fixed seed makes the fit reproducible.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=X_train, y=y_train)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 10 regression trees; the fixed seed makes the fit reproducible.
regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
)
regressor.fit(X=X_train, y=y_train)

### XGBOOST

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
regressor = XGBRegressor()
regressor.fit(X=X_train, y=y_train)

## PREDICTING TEST SET

In [None]:
# Predict with whichever model cell was run last (it owns `regressor`).
y_pred = regressor.predict(X=X_test)

In [None]:
# np.set_printoptions(precision=2)
# print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))
# Print each (prediction, actual value) pair on its own line.
results = list(zip(y_pred, y_test))
for predicted, actual in results:
    print((predicted, actual))

In [None]:
from sklearn.metrics import r2_score

# R² of the predictions against the held-out targets, reported as a percentage.
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print(f"Efficiency (R-SQUARED): {r_squared*100}%")

## PREDICTING SINGLE VALUE

In [None]:
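# NOTE: the template below is stale. It appears to have been carried over from
# a different (calorie-prediction) notebook: it references `weight` and
# `duration`, which are never defined, and prints "Calories Burnt".
# Before enabling it, rebuild the input row so that it has exactly the columns
# of X_train, in the same order, and fix the prompts and the output label.
# d = 1  #int(input("distributor: ")) 
# opening_theaters = 68    #int(input("opening_theaters: "))
# budget = 190   #int(input("budget: "))
# MPAA = 94   #int(input("MPAA: "))
# release_days = 29    #int(input("Enter duration of exercise: "))
# heart_rate = 105  #int(input("Enter heart rate: "))
# temp = 40.8        #int(input("Enter body tempertaure: "))
# print()

# result = regressor.predict([[d,opening_theaters,budget,weight,duration,heart_rate,temp]])[0]
# print(f"Calories Burnt: {result}")  #231.0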