In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import xgboost as xgb
from catboost import CatBoostClassifier
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# `XGBReggreisor` does not exist in xgboost (ImportError on a fresh kernel);
# the notebook actually uses XGBClassifier, which was never imported.
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Read the training and test CSVs into DataFrames.
# NOTE(review): the absolute `/content/...` paths look like a Colab runtime —
# adjust them when running elsewhere.
data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
# Preview the first rows of the training frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
data.info()

In [None]:
# Summary statistics (pandas `describe`) of the training frame.
data.describe()

In [None]:
# Number of missing values in each column of the training frame.
data.isna().sum()

In [None]:
def add_engineered_features(frame):
    """Return a copy of ``frame`` with Cabin and PassengerId split into features.

    ``Cabin`` is split on '/' into ``deck``, ``num`` (converted to a number) and
    ``side``; the part of ``PassengerId`` before the '_' becomes the integer
    ``Group``. The values always come from ``frame`` itself, so one split can
    never be filled with another split's cabins.
    """
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    group_ids = frame['PassengerId'].str.split('_', expand=True)[0]
    return frame.assign(
        deck=cabin_parts[0],
        num=pd.to_numeric(cabin_parts[1]),
        side=cabin_parts[2],
        Group=group_ids.astype(int),
    )


def wow(x):
    """Cast the imputed boolean columns to integers (True -> 1, False -> 0)."""
    return x.astype(int)


# Feature engineering: each frame is derived from its OWN rows. (Previously the
# test frame's deck/num/side were copied from the training `Cabin` column.)
data = add_engineered_features(data)
test_data = add_engineered_features(test_data)

fun = FunctionTransformer(wow)
X = data.drop(columns=['Transported'])
y = data['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

boolean_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ("bool_to_int", fun)
    ])

numeric_features = ["Spa", "Group", "FoodCourt", "VRDeck", "RoomService", "Age", "ShoppingMall", 'num']
categorical_features = ["deck", "side", "HomePlanet", "Destination"]
boolean_features = ["CryoSleep", "VIP"]
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features)
        ],
    # Always return a dense array so it can be wrapped in a DataFrame below.
    sparse_threshold=0,
)

# Fit on the training split ONLY; the hold-out split and the competition test
# set are transformed with the statistics and categories learned here.
preprocessor.fit(X_train)

X_train_preprocessed = preprocessor.transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)
X_test2_preprocessed = preprocessor.transform(test_data)

ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_features)

# Output columns follow the order of the `transformers` list: num, cat, bool.
feature_names = list(numeric_features) + list(ohe_feature_names) + list(boolean_features)

X_train = pd.DataFrame(X_train_preprocessed, columns=feature_names)
X_test = pd.DataFrame(X_test_preprocessed, columns=feature_names)
testing = pd.DataFrame(X_test2_preprocessed, columns=feature_names)

# NOTE: `num` is standardized by the numeric pipeline, so it is deliberately
# left as float; casting it to int would truncate most values to 0 or +/-1.

X_train.head()



In [None]:
# Inspect the type of a single `num` value (the entry with index label 10).
type(X_train['num'][10])

In [None]:
# Correlation heatmap of the preprocessed training features plus the target.
# `assign` builds a separate frame, so the target column is never written into
# `X_train` itself (which is what the models are fitted on).
z = X_train.assign(Transported=y_train.to_list())
hm = z.corr()

fig, ax = plt.subplots(figsize=(16, 9))
sb.heatmap(hm, annot=True, fmt=".2f", ax=ax)
ax.set_title("Feature / target correlation (training split)")
plt.show()

In [None]:
# Baseline gradient-boosted classifier with hand-picked hyperparameters,
# fitted on the preprocessed training split.
bst = XGBClassifier(
    n_estimators=40,
    max_depth=6,
    learning_rate=0.1,
    min_child_weight=5,
    subsample=1,
    colsample_bytree=0.75,
)
bst.fit(X_train, y_train)

In [None]:
# Score the baseline model on the hold-out split.
# NOTE: despite its name, `y_pred` holds the single value returned by
# `bst.score` (a score, not an array of predictions).
y_pred = bst.score(X_test,y_test)
y_pred

0.7983128834355828

In [None]:
# Candidate estimators and the hyperparameter grid searched for each.
# Stochastic estimators are seeded (42, same as the train/test split) so that
# re-running the notebook reproduces the reported scores.
models = {
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10, 100]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7, 9]
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10, 100],
            'gamma': [0.1, 0.01, 0.001, 0.0001]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(),
        'params': {
            'max_depth': [6, 8, 10],
            'learning_rate': [0.01, 0.05, 0.1],
            'n_estimators': [100, 200, 300]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'GaussianNB': {
        'model': GaussianNB(),
        'params': {}
    },
    'GradientBoostingClassifier': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [100, 200, 300],
            'max_depth': [3, 5, 8]
        }
    },
    'SGDClassifier': {
        'model': SGDClassifier(random_state=42),
        'params': {
            # 'log' was renamed to 'log_loss' (scikit-learn 1.1) and the old
            # spelling was later removed, making every such candidate fail.
            'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
            'penalty': ['l2', 'l1', 'elasticnet']
        }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 15]
        }
    },
    'LGBMClassifier': {
        'model': LGBMClassifier(random_state=42),
        'params': {
            'learning_rate': [0.01, 0.1, 1],
            'n_estimators': [20, 40, 60, 80, 100],
            'num_leaves': [31, 60, 90, 120]
        }
    }
}

# Best parameters and mean 5-fold CV accuracy per model name.
best_params_scores = {}

for model_name, model_info in models.items():
    grid_clf = GridSearchCV(
        model_info['model'],
        model_info['params'],
        cv=5,
        scoring='accuracy',  # explicit; this is the metric reported below
        n_jobs=-1,           # run the candidate fits in parallel
    )
    grid_clf.fit(X_train, y_train)

    best_params_scores[model_name] = {
        'best_params': grid_clf.best_params_,
        'best_score': grid_clf.best_score_
    }

for model_name, params_score in best_params_scores.items():
    print(f"Model: {model_name}")
    print(f"Best parameters: {params_score['best_params']}")
    print(f"Best score: {params_score['best_score']}\n")


In [None]:
# Wider hyperparameter search for LightGBM only.
lgb = {
    'model': LGBMClassifier(),
    'params': {
        'learning_rate': [0.01, 0.1, 0.05, 1],
        'n_estimators': [20, 40, 60, 80, 100],
        'num_leaves': [31, 60, 90, 120],
        'max_depth': [4, 6],
        'colsample_bytree': [0.7, 0.8, 0.9],
        # NOTE(review): LightGBM is documented to apply `subsample` only when
        # `subsample_freq` > 0 — confirm whether this axis has any effect.
        'subsample': [0.7, 0.8, 0.9],
        'min_child_samples': [1, 5, 10]
    }
}

model = GridSearchCV(
    lgb['model'],
    lgb['params'],
    cv=5,
    n_jobs=-1,
    # The target is a class label, so candidates are ranked by accuracy; the
    # previous regression metric (negative RMSE) is not meaningful here.
    scoring='accuracy',
)
model.fit(X_train, y_train)

# NOTE: despite its name, `best_params` is the refitted best estimator.
best_params = model.best_estimator_
print(best_params)

In [None]:
# LightGBM with fixed hyperparameters, evaluated on the hold-out split.
# The estimator is bound to `lgbc`, the name the fit/score calls (and the
# commented-out submission code) use; it was previously assigned to `lgb`,
# leaving `lgbc` undefined on a fresh kernel.
lgbc = LGBMClassifier(colsample_bytree=0.7, learning_rate=0.01, max_depth=4,
                      min_child_samples=1, n_estimators=20, subsample=0.7)

lgbc.fit(X_train, y_train)
lgbc.score(X_test, y_test)

# y_pred = lgbc.predict(testing)
# test_data['Transported'] =y_pred
# final_test = test_data[['PassengerId' , 'Transported']]
# final_test.head()
# final_test.to_csv("submission.csv" , index=False)

In [None]:
# Benchmark with LazyClassifier: fit on the training split, evaluate on the
# hold-out split.
# NOTE: this rebinds `models` to the first value returned by `clf.fit`.
clf = LazyClassifier(verbose = 0 , ignore_warnings = True)
models , predictions = clf.fit(X_train, X_test , y_train , y_test)
models

In [None]:
# Short-list of tuned models, each fitted on the training split and scored on
# the hold-out split.
models = {
    'lgbm': LGBMClassifier(colsample_bytree=0.7, learning_rate=0.01, max_depth=4,
                           min_child_samples=1, n_estimators=20, subsample=0.7),
    # Seeded (42, same as the train/test split) so the score is reproducible.
    'random_forest': RandomForestClassifier(max_depth=15, min_samples_split=5,
                                            n_estimators=300, random_state=42),
    'svc': SVC(C=10, gamma=0.1),
    'xgb': XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=100),
}

for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    # Label each score so the output can be read without counting lines.
    print(f"{name}: {score}")
    # y_pred = model.predict(testing)
    # test_data['Transported'] = y_pred.astype(bool)
    # final_test = test_data[['PassengerId' , 'Transported']]

    # final_test.head()
    # final_test.to_csv(f"{name}.csv" , index=False)
