In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
# Columns removed right after loading; they are not used as model inputs.
cols_to_drop = ['name','released']

# Read the dataset and discard the unwanted columns in a single expression,
# so `df` is bound exactly once and never mutated in place.
df = pd.read_csv('data/corrected_fame_dataset.csv').drop(columns=cols_to_drop)

In [10]:
# Name of the label column the models are trained to predict.
col_to_predict = 'is_movie_successful'

# Target series first, then the feature frame made of every other column.
y = df[col_to_predict]
X = df.drop(columns=[col_to_predict])

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


# Dense one-hot encoder for the non-numeric features; categories that were not
# seen while fitting are ignored instead of raising an error.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` (the old
# name was removed in 1.4), so try the current name first and fall back to the
# legacy one on older installations.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Pipeline applied to the categorical (non-numeric) columns.
categorical_pipeline = Pipeline(
    steps=[
        ("oh-encode", one_hot_encoder)
    ]
)

# Pipeline applied to the numeric columns.
numerical_pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler())
    ]
)

# Route each column to a pipeline according to its dtype in `X`.
cat_cols = X.select_dtypes(exclude="number").columns
num_cols = X.select_dtypes(include="number").columns


# Combined preprocessor: scaled numeric columns followed by the one-hot
# encoded categorical columns.
full_processor = ColumnTransformer(
    transformers=[
        ("numeric", numerical_pipeline, num_cols),
        ("categorical", categorical_pipeline, cat_cols)
    ]
)

In [15]:
import xgboost as xgb

# Gradient-boosted tree classifier with the library's default hyperparameters.
xgb_cl = xgb.XGBClassifier()

# Show which concrete estimator class was instantiated.
estimator_type = type(xgb_cl)
print(estimator_type)

<class 'xgboost.sklearn.XGBClassifier'>


In [21]:
# Encode the target as integers (Yes -> 1, No -> 0) and keep it one-dimensional,
# which is the shape scikit-learn style estimators and metrics expect for labels.
y_processed = y.map({'Yes': 1, 'No': 0}).to_numpy()


from sklearn.model_selection import train_test_split

# Split the raw rows BEFORE any preprocessing is fitted. Fitting the scaler and
# encoder on the full data set would let statistics and categories from the
# held-out rows influence the training features (data leakage).
# The split is stratified on the target and seeded for reproducibility.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_processed, stratify=y_processed, random_state=1121218
)

# Learn scaling parameters and category vocabularies from the training rows
# only, then apply that fitted transformation unchanged to the test rows.
X_train = full_processor.fit_transform(X_train_raw)
X_test = full_processor.transform(X_test_raw)

In [23]:
from sklearn.metrics import accuracy_score

# Train on the training split and predict the held-out split in one chained
# call (`fit` returns the fitted estimator).
preds = xgb_cl.fit(X_train, y_train).predict(X_test)

# Fraction of held-out rows classified correctly (displayed as the cell result).
# NOTE(review): the recorded run scored 1.0; a perfect score often means one of
# the features encodes the label - verify the feature columns.
accuracy_score(y_true=y_test, y_pred=preds)

1.0

In [25]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=preds)

array([[1003,    0],
       [   0,  914]], dtype=int64)

In [4]:
# Columns expanded into one indicator column per distinct value.
categorical_columns = ['rating', 'genre', 'year','director','writer','star','country','company','month_released']

# Build the features and the target from `df` instead of re-binding `X` and `y`
# from their own previous values. The cell is then idempotent: running it a
# second time no longer maps an already-encoded target to NaN or fails because
# the categorical columns have already been replaced by indicator columns.
X = pd.get_dummies(data=df.drop(columns=col_to_predict), columns=categorical_columns)
y = df[col_to_predict].map({'Yes': 1, 'No': 0})

In [5]:
# Hold out a quarter of the rows for validation; the fixed seed keeps the
# partition identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Report (features shape, target shape) for the training split, then for the
# validation split, one line each.
for features, target in ((X_train, y_train), (X_val, y_val)):
    print(features.shape, target.shape)

(5751, 12841) (5751,)
(1917, 12841) (1917,)


In [7]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyperparameters. The seed is fixed because the
# forest draws bootstrap samples and feature subsets at random; without it the
# fitted model and the reported metrics change from run to run.
random_model = RandomForestClassifier(random_state=42)

In [8]:
# Fit the forest on the training split and predict the validation split in one
# chained call (`fit` returns the fitted estimator).
y_pred = random_model.fit(X_train, y_train).predict(X_val)

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_val, y_pred=y_pred)

array([[1003,    2],
       [   1,  911]], dtype=int64)

In [10]:
# Validation accuracy expressed as a percentage.
100 * accuracy_score(y_true=y_val, y_pred=y_pred)

99.8435054773083