In [None]:
#Library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import klib


from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, classification_report
from sklearn.model_selection import learning_curve
from sklearn.feature_selection import mutual_info_regression
from imblearn.over_sampling import SMOTE


from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor, BaggingRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier



# Handling Missing Values

In [None]:
# Fill missing values column by column:
#   - object / categorical columns -> most frequent value (mode)
#   - every other column           -> column mean

for column in data_f.columns:
    series = data_f[column]
    if not series.isnull().any():
        continue

    if series.dtype == 'object' or isinstance(series.dtype, pd.CategoricalDtype):
        modes = series.mode()
        # An all-NaN column has no mode; leave it untouched instead of raising.
        if modes.empty:
            continue
        fill_value = modes[0]
    else:
        fill_value = series.mean()

    # Assign back instead of `data_f[column].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and has no effect under
    # pandas Copy-on-Write.
    data_f[column] = series.fillna(fill_value)

data_f.info()

In [None]:
# Drop rows that have a missing value in any of the listed columns.
# (The original `df = dfdf = ...` was a paste error that also bound a stray `dfdf`.)

df = df.dropna(subset=['col1', 'col2'])


# Encoding Categorical Features

In [None]:
# Names of the categorical columns of `df` (reused by the encoding cells below).
categorical_col = df.select_dtypes(include=['object', 'category']).columns

# Label encoding
# Only categorical columns are encoded; label-encoding numeric columns would
# replace their values with rank indices. One encoder is fitted per column and
# kept in `label_encoders` so each mapping can be inverted later.

label_encoders = {}
for column in data_f.select_dtypes(include=['object', 'category']).columns:
    label_encoder = LabelEncoder()
    data_f[column] = label_encoder.fit_transform(data_f[column])
    label_encoders[column] = label_encoder


In [None]:
# Number of distinct values per categorical column (inspected before pd.get_dummies).

cordinality = df.loc[:, categorical_col].nunique()


In [None]:

# One-hot encode the categorical columns with pandas and swap them into `df`.
# `dtype=int` makes only the dummy columns 0/1 integers; the previous
# `df.astype(int)` cast the whole frame, truncating float features and
# raising on NaN or non-numeric columns.
encoded_df = pd.get_dummies(df[categorical_col], drop_first=False, dtype=int)
df = df.drop(columns=categorical_col)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# One-hot encoding with scikit-learn

encoder = OneHotEncoder()
encoded_array = encoder.fit_transform(df[categorical_col])
encoded_df = pd.DataFrame(
    encoded_array.toarray(),
    columns=encoder.get_feature_names_out(categorical_col),
    # Reuse df's index so the concat below lines rows up even when df no
    # longer has a default RangeIndex (e.g. after dropna or filtering).
    index=df.index,
)
df = df.drop(columns=categorical_col)
df = pd.concat([df, encoded_df], axis=1)

In [None]:
# Frequency encoding: add a `<name>_freq` column holding how often each value
# occurs, then remove the original high-cardinality columns.

freq_cols = ['Director', 'Writer', 'Cast']

for col in freq_cols:
    counts = df[col].value_counts()
    df[col + '_freq'] = df[col].map(counts)

df.drop(columns=freq_cols, inplace=True)


In [None]:
# Target encoding: replace each country with the mean 'Worldwide Gross' of its rows.
# NOTE(review): the means are computed on all of `df`; if this frame still
# contains test rows, the encoding leaks target information — confirm.

country_mean = df['Worldwide Gross'].groupby(df['Country of Origin']).mean()
df['Country_Encoded'] = df['Country of Origin'].map(country_mean)

# The raw text columns are removed once the encoded column exists.
df.drop(columns=['Country of Origin', 'Languages'], inplace=True)


# Data Manipulation

In [None]:

# Convert `col` to numeric; values that cannot be parsed become NaN.
# NOTE: `col` must be set to the target column name before running this cell.
df[col] = pd.to_numeric(df[col], errors='coerce')

missing_count = df[col].isnull().sum()
if missing_count > 0:
    # Report the column actually processed (the message used to hard-code 'Price').
    print(f" Found {missing_count} non-numerical or missing in '{col}'.")

    # Assign back rather than chained `fillna(..., inplace=True)`, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    df[col] = df[col].fillna(df[col].mean())

    print('\nAfter cleaning: ')
    print(df[col].head())

In [None]:
# Convert a Yes/No text column to 1/0; any other value maps to NaN.

yes_no_map = {"Yes": 1, "No": 0}
df[col] = df[col].map(yes_no_map)


In [None]:
# Date -> numeric parts: parse 'Policy Start Date' (unparseable values become
# NaT), add year / month / day columns, then drop the datetime column.
policy_start = pd.to_datetime(df_cleaned['Policy Start Date'], errors='coerce')
df_cleaned['Policy Start Date'] = policy_start

for part in ('Year', 'Month', 'Day'):
    df_cleaned[f'Policy Start {part}'] = getattr(policy_start.dt, part.lower())

df_cleaned.drop(columns=['Policy Start Date'], inplace=True)

In [None]:
# Cleaning symbols out of numeric columns

# Names of the columns to clean; fill this in before running the cell.
columns_to_clean = list()

def clean_numeric_column(column):
    """Strip non-numeric symbols from a Series and return it as float.

    Currency signs, thousands separators, percent signs and similar symbols
    are removed. A leading minus sign is preserved, so '-5' becomes -5.0
    (previously the sign was stripped and the value came back positive).
    Entries with no digits left become NaN.

    Parameters
    ----------
    column : pandas.Series
        Column to clean; text or already numeric.

    Returns
    -------
    pandas.Series
        Float series with the same index as ``column``.

    Raises
    ------
    ValueError
        If a cleaned entry is still not a valid float (e.g. '1.2.3').
    """
    is_numeric = pd.api.types.is_numeric_dtype(column)
    if is_numeric and not pd.api.types.is_bool_dtype(column):
        # Already numeric: a round trip through str would drop the sign and
        # mangle scientific notation such as 1e-05.
        return column.astype(float)

    if column.dtype != 'object':
        column = column.astype(str)

    # Keep '-' next to digits and dots so the sign can be recovered below.
    stripped = column.str.replace(r'[^\d.\-]', '', regex=True)
    is_negative = stripped.str.startswith('-', na=False)
    magnitude = (
        stripped.str.replace('-', '', regex=False)
                .replace('', np.nan)
                .astype(float)
    )
    return magnitude.mask(is_negative, -magnitude)

# Clean every listed column that exists in df; unknown names are skipped.
for col in columns_to_clean:
    if col not in df.columns:
        continue
    df[col] = clean_numeric_column(df[col])


# Show the resulting dtypes.
print(df.dtypes)

# Scaling

In [None]:
# Standardise features: the scaler is fitted on the training data only and
# then applied to both the training and the test set.
# NOTE(review): X_train / X_test are created by the train/test split cell
# further down — run that cell first.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Final Data Preprocessing

In [None]:
# Run klib's automated cleaning on the training frame and keep the returned frame.
# NOTE(review): what exactly gets cleaned depends on klib's defaults — confirm against the klib docs.
train= klib.data_cleaning(train)


In [None]:
# Hold out 20% of the rows as a test set (fixed seed).
split = train_test_split(X, y, test_size=0.2, random_state=48)
X_train, X_test, y_train, y_test = split

In [None]:
# Oversample the training set only with SMOTE; the test set is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = resampled

# Models

In [None]:
# Logistic Regression

# Uses X_train / X_test as produced by the split cell (the lowercase
# x_train / x_test names are never defined in this notebook).
log_reg = LogisticRegression(max_iter=1000, solver='lbfgs')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

In [None]:
# Linear Regression

# Fit on the training split, then report MSE and R^2 on the test split.
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

In [None]:
# Decision Tree Classifier

d_tree = DecisionTreeClassifier(random_state=42)
d_tree.fit(X_train, y_train)
# Predict with the tree fitted above, on the test split from the split cell
# (previously this called `model.predict(x_test_scaled)`, i.e. another
# estimator on an undefined array).
y_pred = d_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

In [None]:
# Decision Tree Regressor
d_tree = DecisionTreeRegressor(random_state=42)
d_tree.fit(X_train, y_train)
# Predict with the tree fitted above, on the test split from the split cell
# (previously this called `model.predict(x_test_scaled)`, i.e. another
# estimator on an undefined array).
y_pred = d_tree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

In [None]:
# Random Forest Classifier

# Uses X_train / X_test as produced by the split cell (the lowercase
# x_train / x_test names are never defined in this notebook).
r_forest = RandomForestClassifier(n_estimators=100, random_state=42)
r_forest.fit(X_train, y_train)
y_pred = r_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

In [None]:
# Random Forest Regressor
# Uses X_train / X_test as produced by the split cell (the lowercase
# x_train / x_test names are never defined in this notebook).
r_forest = RandomForestRegressor(n_estimators=100, random_state=42)
r_forest.fit(X_train, y_train)
y_pred = r_forest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

In [None]:
# K-Nearest Neighbors Classifier

# k = 5 nearest neighbors; fit on the training split, score on the test split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

In [None]:
# K-Nearest Neighbors Regressor (k = 5 nearest neighbors)
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

In [None]:
# AdaBoost Classifier

# Uses X_train / X_test as produced by the split cell (the lowercase
# x_train / x_test names are never defined in this notebook).
ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

# Advanced ensemble learning techniques

In [None]:
# Stacking


# Base learners as (name, estimator) pairs; the blending cell reuses this list.
base_models = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('random_forest', RandomForestClassifier(n_estimators=100, random_state=42)),
]

# 1. Stacking: a logistic-regression meta-learner over the base learners, cv=5.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),
    cv=5,
).fit(X_train, y_train)
stacking_preds = stacking_model.predict(X_test)

# Stacking accuracy on the held-out test split
stacking_accuracy = accuracy_score(y_test, stacking_preds)
print(f"Stacking Accuracy: {stacking_accuracy:.2f}")

In [None]:
# Blending

# Carve a validation fold out of the training data for the meta-model.
blend_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
X_train_blend, X_val, y_train_blend, y_val = blend_split

# Train the base models on the reduced training fold.
for name, model in base_models:
    model.fit(X_train_blend, y_train_blend)

fitted_estimators = [estimator for _, estimator in base_models]

# One column of predicted labels per base model.
val_preds = np.column_stack([est.predict(X_val) for est in fitted_estimators])

# The meta-model learns from the base models' validation predictions.
meta_model = LogisticRegression().fit(val_preds, y_val)

test_preds = np.column_stack([est.predict(X_test) for est in fitted_estimators])
blending_preds = meta_model.predict(test_preds)

# Blending accuracy on the held-out test split
blending_accuracy = accuracy_score(y_test, blending_preds)
print(f"Blending Accuracy: {blending_accuracy:.2f}")

In [None]:
# Bagging

# 50 bagged decision trees, fitted on the training split.
bagging_model = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)
bagging_pred = bagging_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_pred)
print(f"Bagging Classifier Accuracy: {bagging_accuracy:.4f}")

In [None]:
# Boosting (AdaBoost)

# AdaBoost with 50 estimators, fitted on the training split.
boosting_model = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)
boosting_pred = boosting_model.predict(X_test)
boosting_accuracy = accuracy_score(y_test, boosting_pred)
print(f"Boosting Classifier (Ada Boosting) Accuracy: {boosting_accuracy:.4f}")

# Validation

In [None]:
# Perform K-fold cross-validation (k=5)
# NOTE: bind `model` to the estimator you want to evaluate before running this cell.

# Use the feature matrix `X` (as in the split and mutual-information cells);
# the lowercase `x` is not the feature matrix anywhere in this notebook.
cv_scores = cross_val_score(model, X, y, cv=5)  # cv=5 means 5-fold cross-validation
avg_cv_scores = np.mean(cv_scores)

avg_cv_scores

# Improving

In [None]:
# Mutual information between each feature and the target
# The estimator adds random noise to continuous features, so a fixed
# random_state keeps the scores reproducible between runs.
mi_scores = mutual_info_regression(X, y, random_state=42)

# One row per feature, highest score first.
mi_scores_df = pd.DataFrame(mi_scores, index=X.columns, columns=['MI Score'])
mi_scores_df = mi_scores_df.sort_values(by='MI Score', ascending=False)

print(mi_scores_df)

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    Parameters
    ----------
    scores : pandas.Series
        Scores indexed by feature name. They are sorted ascending before
        plotting, so the largest bar ends up at the top.

    The chart is drawn on the current matplotlib figure and then shown.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    labels = list(ordered.index)

    plt.barh(positions, ordered)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")
    plt.xlabel("Mutual Information Scores")
    plt.ylabel("Features")
    plt.show()


# Open an 8x5 inch figure at 100 dpi and plot the MI scores on it.
plt.figure(figsize=(8, 5), dpi=100)
mi_column = mi_scores_df["MI Score"]
plot_mi_scores(mi_column)


# Charts

In [None]:
# Bar chart (template: replace x, y and the labels with real values)

plt.bar(x, y, width=0.5)
plt.title("Title")
plt.xlabel("X-axis label")
# The closing quote and parenthesis were missing here, which made the cell a SyntaxError.
plt.ylabel("Y-axis label")
plt.show()

In [None]:
# Learning Curve
# NOTE: bind `model` to the estimator you want to inspect before running this cell.

# X_train is the (scaled) training matrix from the cells above;
# `x_train_scaled` is never defined in this notebook.
train_sizes, train_scores, val_scores = learning_curve(
    model,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
    train_sizes=[0.1, 0.33, 0.55, 0.78, 1.0],
)

# Average the per-fold scores for each training-set size.
train_mean = train_scores.mean(axis=1)
val_mean = val_scores.mean(axis=1)


plt.figure(figsize=(8, 5))
plt.plot(train_sizes, train_mean, label='Training Accuracy', marker='o')
plt.plot(train_sizes, val_mean, label='Validation Accuracy', marker='o')
plt.xlabel('Training Set Size')
plt.ylabel('Accuracy')
plt.legend()
# Name the estimator actually evaluated instead of a hard-coded model name.
plt.title(f'Learning Curves for {type(model).__name__}')
plt.show()

In [None]:
# Correlation

# Work on a separate numeric-only view: rebinding `X` here would overwrite the
# feature matrix used by the modelling cells, and `corr()` raises on
# non-numeric columns in pandas 2.x.
# NOTE: replace 'target' with the name of the (numeric) target column.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix[['target']], annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation of Features with target")
plt.show()

# Check the correlation between all features to spot any potential multicollinearity
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Correlation Matrix of Features")
plt.show()