In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn import svm
from sklearn import model_selection
from statsmodels.tools.eval_measures import mse

In [2]:
df = pd.read_csv('movies_dataset_processed.csv')
df

Unnamed: 0.1,Unnamed: 0,IMDb-rating,appropriate_for,director,downloads,industry,language,posted_date,release_date,run_time,storyline,title,views,writer,days_to_post,bucket
0,0,4.8,R,John Swab,304,Holywood,English,2023-02-20,2023-01-28,105,Doc\r\n facilitates a fragile truce between th...,Little Dixie,2794,John Swab,23,6.0
1,1,6.4,TV-PG,Paul Ziller,73,Holywood,English,2023-02-20,2023-02-05,84,Caterer\r\n Goldy Berry reunites with detectiv...,Grilling Season: A Curious Caterer Mystery,1002,John Christian Plummer,15,6.0
2,2,5.2,R,Ben Wheatley,1427,Holywood,"English,Hindi",2021-04-20,2021-06-18,107,As the world searches for a cure to a disastro...,In the Earth,14419,Ben Wheatley,59,7.0
3,3,6.5,R,Benjamin Caron,1781,Holywood,English,2023-02-13,2023-02-17,116,"Motivations are suspect, and expectations are ...",Sharper,18225,"Brian Gatewood, Alessandro Tanaka",4,4.0
4,4,6.9,PG-13,Ravi Kapoor,458,Holywood,English,2023-02-18,2022-12-02,80,An\r\n unmotivated South Asian American rapper...,Four Samosas,6912,Ravi Kapoor,78,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,9897,7.1,Not Rated,Biren Nag,1932,Bolywood,Hindi,1970-01-01,1962-05-11,158,"After a lusty Thakur rapes a young girl, she k...",Bees Saal Baad,6076,"Dhruva Chatterjee, Dev Kishan",2792,9.0
9898,9898,7.0,G,Guy Hamilton,2544,Holywood,"English,German,Polish,French",1970-01-01,1969-09-17,132,Historical reenactment of the air war in the e...,Battle of Britain,9319,"James Kennaway, Wilfred Greatorex, Derek Dempster",106,8.0
9899,9899,5.6,R,Barbara Topsøe-Rothenborg,12284,Holywood,"Spanish,German,English",2016-05-26,1970-01-01,90,"LOVE AT FIRST HICCUP is a charming, innocent, ...",Love at First Hiccup,36022,"Barbara Topsøe-Rothenborg, Søren Frellesen, De...",16947,10.0
9900,9900,7.1,Not Rated,Biren Nag,1932,Bolywood,Hindi,1970-01-01,1962-05-11,158,"After a lusty Thakur rapes a young girl, she k...",Bees Saal Baad,6077,"Dhruva Chatterjee, Dev Kishan",2792,9.0


In [3]:
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
df['release_year']

0       2023
1       2023
2       2021
3       2023
4       2022
        ... 
9897    1962
9898    1969
9899    1970
9900    1962
9901    1969
Name: release_year, Length: 9902, dtype: int32

In [4]:
cols_to_label_enc = ['appropriate_for', 'director', 'industry']
label_encs = {}
for col in cols_to_label_enc:
    label_encs[col] = {}
    unique_values = df[col].unique()
    for i, value in enumerate(unique_values):
        label_encs[col][value] = i
    df[col] = df[col].apply(lambda x: label_encs[col][x])

In [5]:
df['downloads'] = df['downloads'].apply(lambda x: float(str(x).replace(',', '')))
df['views'] = df['views'].apply(lambda x: float(str(x).replace(',', '')))

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,IMDb-rating,appropriate_for,director,downloads,industry,language,posted_date,release_date,run_time,storyline,title,views,writer,days_to_post,bucket,release_year
0,0,4.8,0,0,304.0,0,English,2023-02-20,2023-01-28,105,Doc\r\n facilitates a fragile truce between th...,Little Dixie,2794.0,John Swab,23,6.0,2023
1,1,6.4,1,1,73.0,0,English,2023-02-20,2023-02-05,84,Caterer\r\n Goldy Berry reunites with detectiv...,Grilling Season: A Curious Caterer Mystery,1002.0,John Christian Plummer,15,6.0,2023
2,2,5.2,0,2,1427.0,0,"English,Hindi",2021-04-20,2021-06-18,107,As the world searches for a cure to a disastro...,In the Earth,14419.0,Ben Wheatley,59,7.0,2021
3,3,6.5,0,3,1781.0,0,English,2023-02-13,2023-02-17,116,"Motivations are suspect, and expectations are ...",Sharper,18225.0,"Brian Gatewood, Alessandro Tanaka",4,4.0,2023
4,4,6.9,2,4,458.0,0,English,2023-02-18,2022-12-02,80,An\r\n unmotivated South Asian American rapper...,Four Samosas,6912.0,Ravi Kapoor,78,7.0,2022


In [7]:
df_1 = df.drop(['storyline','title','writer','language','posted_date','release_date'],axis=1)

### Scaling the Variables

In [8]:
df_1.dtypes

Unnamed: 0           int64
IMDb-rating        float64
appropriate_for      int64
director             int64
downloads          float64
industry             int64
run_time             int64
views              float64
days_to_post         int64
bucket             float64
release_year         int32
dtype: object

In [9]:
df_1 = (df_1-df_1.min()) / (df_1.max() - df_1.min())
df_1.head()

Unnamed: 0.1,Unnamed: 0,IMDb-rating,appropriate_for,director,downloads,industry,run_time,views,days_to_post,bucket,release_year
0,0.0,0.45122,0.0,0.0,0.000777,0.0,0.28,0.001094,0.000788,0.555556,1.0
1,0.000101,0.646341,0.052632,0.000194,0.000187,0.0,0.21,0.0,0.000514,0.555556,1.0
2,0.000202,0.5,0.0,0.000388,0.003647,0.0,0.286667,0.008193,0.002021,0.666667,0.978261
3,0.000303,0.658537,0.0,0.000582,0.004552,0.0,0.316667,0.010518,0.000137,0.333333,1.0
4,0.000404,0.707317,0.105263,0.000776,0.001171,0.0,0.196667,0.003609,0.002672,0.666667,0.98913


In [10]:
correlations = df_1.corr()

In [11]:
target_correlations = correlations['bucket']
target_correlations

Unnamed: 0         0.045670
IMDb-rating        0.041140
appropriate_for    0.041154
director           0.124563
downloads         -0.138247
industry           0.083284
run_time           0.016308
views             -0.113743
days_to_post       0.502099
bucket             1.000000
release_year      -0.380979
Name: bucket, dtype: float64

In [12]:
X = df[['director','release_year']]
y = df['bucket']

### Train- Test Split

In [13]:
X

Unnamed: 0,director,release_year
0,0,2023
1,1,2023
2,2,2021
3,3,2023
4,4,2022
...,...,...
9897,5156,1962
9898,5151,1969
9899,375,1970
9900,5156,1962


In [14]:
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size =.3, random_state=1)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
scaler = StandardScaler()

In [17]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### Model Fit

In [18]:
h1 = svm.SVR()

In [19]:
h1.fit(X_train,y_train)

In [20]:
y_pred_1 = h1.predict(X_train)

In [21]:
print(mse(y_pred_1,y_train))

6.434770110082262


### Model- Evaluation

In [22]:
y_pred = h1.predict(X_test)

In [23]:
print(mse(y_pred,y_test))

6.259176746476148


In [24]:
h1.score(X_test,y_test)

0.06248554783395743

In [25]:
from sklearn.svm import SVR

In [26]:
svr_poly = SVR(kernel='poly',degree = 1)

In [27]:
svr_poly.fit(X_train,y_train)

In [28]:
y_pred_1 = svr_poly.predict(X_train)

In [29]:
print(mse(y_pred_1,y_train))

6.567328425341234


In [30]:
y_pred = svr_poly.predict(X_test)

In [31]:
print(mse(y_pred,y_test))

6.770213137177838


In [32]:
svr_poly.score(X_test,y_test)

-0.014058704113447229

In [33]:
svr_regressor = SVR(kernel='rbf')

In [34]:
svr_regressor.fit(X_train_scaled,y_train)

In [42]:
from sklearn.metrics import r2_score

y_pred = svr_regressor.predict(X_test_scaled)
r2 = r2_score(y_test,y_pred)
r2

0.22291222161818502

In [36]:
print(mse(y_test,y_pred))

5.188111757830075


In [37]:
svr_regressor.score(y_test,y_pred)

ValueError: Expected 2D array, got 1D array instead:
array=[ 9. 10.  7. ...  8.  9. 10.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [39]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR())
])

In [40]:
param_grid = {
    'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'svm__C': [0.1, 1, 10, 100],
    'svm__gamma': [0.1, 1, 10, 100],
    'svm__degree': [2, 3, 4, 5]
}

In [41]:
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

KeyboardInterrupt: 