In [1]:
#importing necessary libraries
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
pd.set_option('display.max_columns', None)  # Show all columns in DataFrame output
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest

In [2]:
# Load the IMDb dataset and discard the 'canHaveEpisodes' column in a single
# chained expression (no in-place mutation), then display the resulting frame.
df = pd.read_csv('imdb_4.csv').drop(columns=['canHaveEpisodes'])
df

Unnamed: 0,startYear,runtimeMinutes,totalCredits,numRegions,ratingCount,castNumber,companiesNumber,writerCredits,directorsCredits,totalNominations,totalMedia,totalReviews,Asia,Africa,Europe,North America,South America,Oceania,Continent Unknown,genre1,genre2,genre3,movie,short,tvEpisode,tvMiniSeries,tvMovie,tvSeries,tvShort,tvSpecial,video,videoGame,rating_bin
0,1894,1,4,7,2092,1,3,0,1,0,11,19,1,0,5,1,0,0,0,16787,16581,0,0,1,0,0,0,0,0,0,0,0,1
1,1892,12,2,6,183,0,0,0,1,0,5,1,1,0,5,0,0,0,0,17855,16787,0,0,1,0,0,0,0,0,0,0,0,1
2,1894,1,1,5,195,0,1,0,1,0,5,0,0,0,4,1,0,0,0,16787,0,0,0,1,0,0,0,0,0,0,0,0,1
3,1894,1,4,6,2238,1,6,0,1,1,9,22,1,0,4,1,0,0,0,16787,16581,0,0,1,0,0,0,0,0,0,0,0,1
4,1896,1,11,21,13115,6,5,0,2,0,33,82,2,0,14,2,1,0,2,16787,16581,0,0,1,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149526,1993,96,11,1,11,0,2,1,1,0,1,0,0,0,0,0,1,0,0,16581,0,0,1,0,0,0,0,0,0,0,0,0,3
149527,2019,14,52,1,15,6,3,1,1,3,10,1,0,0,1,0,0,0,0,51745,16787,0,0,1,0,0,0,0,0,0,0,0,2
149528,2019,29,32,0,12,1,0,0,0,0,5,0,0,0,0,0,0,0,0,5726,0,0,0,0,1,0,0,0,0,0,0,0,1
149529,2011,10,15,0,10,7,0,3,1,0,1,0,0,0,0,0,0,0,0,47408,17855,17763,0,0,1,0,0,0,0,0,0,0,3


# 1% outliers removed (Isolation Forest, contamination = 0.01)

In [3]:
%%time
# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Initialize the IsolationForest model
isolation_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# Fit the model to the data
outlier_predictions = isolation_forest.fit_predict(df_scaled)

# Add the outlier predictions as a new column in the dataframe
df['ISF_Score'] = outlier_predictions

# Display the counts of inliers (1) and outliers (-1)
print(df['ISF_Score'].value_counts())

ISF_Score
 1    148035
-1      1496
Name: count, dtype: int64
CPU times: total: 1.8 s
Wall time: 1.96 s


In [4]:
# Keep only the rows the Isolation Forest labelled as inliers (ISF_Score of 1)
inlier_mask = df['ISF_Score'].eq(1)
df_clean = df.loc[inlier_mask]

# Target is the rating bin; features are every remaining column except the
# target itself and the ISF_Score outlier flag
y = df_clean['rating_bin']
X = df_clean.drop(columns=['ISF_Score', 'rating_bin'])

# Hold out 20% of the cleaned rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
%%time
# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=50, metric='manhattan', weights = 'distance')  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.42
Classification Report:
              precision    recall  f1-score   support

           0       0.44      0.27      0.33      2594
           1       0.34      0.19      0.24      3881
           2       0.38      0.39      0.39      7241
           3       0.45      0.67      0.54      9689
           4       0.41      0.28      0.34      5064
           5       0.55      0.16      0.25      1138

    accuracy                           0.42     29607
   macro avg       0.43      0.33      0.35     29607
weighted avg       0.41      0.42      0.40     29607

CPU times: total: 2min 49s
Wall time: 3min 25s


In [6]:
%%time
dtc = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=4, min_samples_split=10, splitter = 'random')# Initialize the Decision Tree Classifier
dtc.fit(X_train, y_train)

# Make predictions
y_pred = dtc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.3940959908129834

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.22      0.29      2594
           1       0.33      0.18      0.23      3881
           2       0.37      0.33      0.35      7241
           3       0.41      0.80      0.54      9689
           4       0.42      0.05      0.09      5064
           5       0.33      0.03      0.05      1138

    accuracy                           0.39     29607
   macro avg       0.38      0.27      0.26     29607
weighted avg       0.39      0.39      0.33     29607

CPU times: total: 516 ms
Wall time: 862 ms


# 5% outliers removed (Isolation Forest, contamination = 0.05)

In [7]:
%%time
# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Initialize the IsolationForest model
isolation_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)

# Fit the model to the data
outlier_predictions = isolation_forest.fit_predict(df_scaled)

# Add the outlier predictions as a new column in the dataframe
df['ISF_Score'] = outlier_predictions

# Display the counts of inliers (1) and outliers (-1)
print(df['ISF_Score'].value_counts())

ISF_Score
 1    142054
-1      7477
Name: count, dtype: int64
CPU times: total: 3.19 s
Wall time: 3.99 s


In [8]:
# Keep only the rows the Isolation Forest labelled as inliers (ISF_Score of 1)
inlier_mask = df['ISF_Score'].eq(1)
df_clean = df.loc[inlier_mask]

# Target is the rating bin; features are every remaining column except the
# target itself and the ISF_Score outlier flag
y = df_clean['rating_bin']
X = df_clean.drop(columns=['ISF_Score', 'rating_bin'])

# Hold out 20% of the cleaned rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
%%time
# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=50, metric='manhattan', weights = 'distance')  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.42
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.26      0.32      2400
           1       0.35      0.20      0.25      3708
           2       0.38      0.38      0.38      6954
           3       0.45      0.68      0.54      9361
           4       0.41      0.29      0.34      4908
           5       0.53      0.16      0.24      1080

    accuracy                           0.42     28411
   macro avg       0.42      0.33      0.35     28411
weighted avg       0.41      0.42      0.40     28411

CPU times: total: 2min 50s
Wall time: 4min 17s


In [10]:
%%time
dtc = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=4, min_samples_split=10, splitter = 'random')# Initialize the Decision Tree Classifier
dtc.fit(X_train, y_train)

# Make predictions
y_pred = dtc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.392981591637042

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.22      0.29      2400
           1       0.31      0.14      0.19      3708
           2       0.36      0.32      0.34      6954
           3       0.41      0.81      0.54      9361
           4       0.42      0.05      0.09      4908
           5       0.42      0.06      0.11      1080

    accuracy                           0.39     28411
   macro avg       0.39      0.27      0.26     28411
weighted avg       0.39      0.39      0.33     28411

CPU times: total: 281 ms
Wall time: 373 ms


# 10% outliers removed (Isolation Forest, contamination = 0.1)

In [11]:
%%time
# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Initialize the IsolationForest model
isolation_forest = IsolationForest(n_estimators=100, contamination=0.1, random_state=42)

# Fit the model to the data
outlier_predictions = isolation_forest.fit_predict(df_scaled)

# Add the outlier predictions as a new column in the dataframe
df['ISF_Score'] = outlier_predictions

# Display the counts of inliers (1) and outliers (-1)
print(df['ISF_Score'].value_counts())

ISF_Score
 1    134578
-1     14953
Name: count, dtype: int64
CPU times: total: 1.83 s
Wall time: 1.96 s


In [12]:
# Keep only the rows the Isolation Forest labelled as inliers (ISF_Score of 1)
inlier_mask = df['ISF_Score'].eq(1)
df_clean = df.loc[inlier_mask]

# Target is the rating bin; features are every remaining column except the
# target itself and the ISF_Score outlier flag
y = df_clean['rating_bin']
X = df_clean.drop(columns=['ISF_Score', 'rating_bin'])

# Hold out 20% of the cleaned rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
%%time
# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=50, metric='manhattan', weights = 'distance')  # You can adjust the number of neighbors
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.42
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.25      0.31      2103
           1       0.34      0.19      0.24      3338
           2       0.38      0.37      0.38      6539
           3       0.45      0.68      0.54      9047
           4       0.40      0.28      0.33      4803
           5       0.57      0.18      0.27      1086

    accuracy                           0.42     26916
   macro avg       0.42      0.32      0.34     26916
weighted avg       0.41      0.42      0.40     26916

CPU times: total: 1min 38s
Wall time: 1min 42s


In [14]:
%%time
dtc = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=4, min_samples_split=10, splitter = 'random')# Initialize the Decision Tree Classifier
dtc.fit(X_train, y_train)

# Make predictions
y_pred = dtc.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.3878362312379254

Classification Report:
               precision    recall  f1-score   support

           0       0.34      0.24      0.28      2103
           1       0.34      0.10      0.16      3338
           2       0.35      0.33      0.34      6539
           3       0.41      0.78      0.54      9047
           4       0.41      0.07      0.12      4803
           5       0.61      0.04      0.07      1086

    accuracy                           0.39     26916
   macro avg       0.41      0.26      0.25     26916
weighted avg       0.39      0.39      0.33     26916

CPU times: total: 234 ms
Wall time: 247 ms
