In [41]:
#solution 1

In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

train_data = pd.read_excel("D://termdeposit_trains.xlsx")
test_data = pd.read_excel("D://termdeposit_tests.xlsx")

label_encoders = {}
binary_cols = ['default', 'housing', 'loan', 'subscribed']
for col in binary_cols:
    label_encoders[col] = LabelEncoder()
    train_data[col] = label_encoders[col].fit_transform(train_data[col])
    if col in test_data.columns:
        test_data[col] = label_encoders[col].transform(test_data[col])

train_data = pd.get_dummies(train_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day', 'poutcome'])
test_data = pd.get_dummies(test_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day', 'poutcome'])

test_data = test_data.reindex(columns=train_data.columns.drop('subscribed'), fill_value=0)

X_train = train_data.drop('subscribed', axis=1)
y_train = train_data['subscribed']

imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
test_data = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

scaler = StandardScaler()
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])


In [8]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score

X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

log_reg = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier()
gb_clf = GradientBoostingClassifier()

models = [log_reg, rf_clf, gb_clf]
for model in models:
    model.fit(X_train_split, y_train_split)
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    print(f'{model.__class__.__name__} Classification Report:')
    print(classification_report(y_val, y_pred))
    print(f'ROC-AUC: {roc_auc_score(y_val, y_pred_proba)}\n')

best_model = gb_clf 

best_model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      5599
           1       0.64      0.41      0.50       731

    accuracy                           0.91      6330
   macro avg       0.78      0.69      0.73      6330
weighted avg       0.89      0.91      0.90      6330

ROC-AUC: 0.9239533442189329

RandomForestClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      5599
           1       0.67      0.43      0.52       731

    accuracy                           0.91      6330
   macro avg       0.80      0.70      0.74      6330
weighted avg       0.90      0.91      0.90      6330

ROC-AUC: 0.9407337493577244

GradientBoostingClassifier Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      5599
           1       0.63      0.51      0.

In [10]:
test_predictions = best_model.predict(test_data)

submission = pd.DataFrame({'ID': test_data['ID'], 'subscribed': test_predictions})
submission['subscribed'] = submission['subscribed'].apply(lambda x: 'yes' if x == 1 else 'no')
submission.to_csv('submission.csv', index=False)


In [13]:
submission


Unnamed: 0,ID,subscribed
0,38441.0,no
1,40403.0,yes
2,3709.0,no
3,37422.0,no
4,12527.0,no
...,...,...
13559,23465.0,no
13560,11743.0,no
13561,28292.0,no
13562,45163.0,yes


In [40]:
#solution 2

In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from lightgbm import LGBMRegressor

# Load the datasets
train_data = pd.read_excel("D://Data_Train.xlsx")
test_data = pd.read_excel("D://Data_Test.xlsx")

# Replace non-numeric values with NaN in numerical columns
train_data['RATING'].replace(['-', 'NEW'], np.nan, inplace=True)
test_data['RATING'].replace(['-', 'NEW'], np.nan, inplace=True)
train_data['VOTES'].replace('-', np.nan, inplace=True)
test_data['VOTES'].replace('-', np.nan, inplace=True)

# Remove non-numeric characters from the 'VOTES' column and convert to float
train_data['VOTES'] = train_data['VOTES'].str.extract('(\d+)').astype(float)
test_data['VOTES'] = test_data['VOTES'].str.extract('(\d+)').astype(float)

# Convert the columns to numeric
train_data['RATING'] = pd.to_numeric(train_data['RATING'])
test_data['RATING'] = pd.to_numeric(test_data['RATING'])

# Handling missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
train_data[['RATING', 'VOTES']] = imputer.fit_transform(train_data[['RATING', 'VOTES']])
test_data[['RATING', 'VOTES']] = imputer.transform(test_data[['RATING', 'VOTES']])

# Separate features and target variable
X = train_data.drop(['COST'], axis=1)
y = train_data['COST']
X_test = test_data.copy()

# Categorical columns for encoding
categorical_cols = ['TITLE', 'CUISINES', 'TIME', 'CITY', 'LOCALITY']

# Preprocessing pipeline for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ['RATING', 'VOTES']),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Transform the data
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Splitting data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Define LightGBM model and parameter grid
lgb_regressor = LGBMRegressor(random_state=42)

lgb_param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.01, 0.1],
    'num_leaves': [31, 40]
}

# Perform GridSearchCV for LightGBM
lgb_grid_search = GridSearchCV(estimator=lgb_regressor, param_grid=lgb_param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)

# Fit the model
lgb_grid_search.fit(X_train, y_train)

# Get the best model
best_lgb = lgb_grid_search.best_estimator_

# Print best parameters found
print("Best LightGBM parameters:", lgb_grid_search.best_params_)

# Evaluate the best model on the validation set
y_pred = best_lgb.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
print(f'LightGBM MSE on validation set: {mse}')

# Assuming LightGBM performed the best, train on full data and predict on test dataset
best_lgb.fit(X_processed, y)
test_predictions = best_lgb.predict(X_test_processed)

# Prepare the submission file
submission = pd.DataFrame({'RESTAURANT_ID': test_data['RESTAURANT_ID'], 'COST': test_predictions})
submission.to_csv('submission.csv', index=False)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002322 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 957
[LightGBM] [Info] Number of data points in the train set: 10152, number of used features: 338
[LightGBM] [Info] Start training from score 659.003940
Best LightGBM parameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}
LightGBM MSE on validation set: 99420.91099674524
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1141
[LightGBM] [Info] Number of data points in the train set: 12690, number of used features: 430
[LightGBM] [Info] Start training from sc

In [34]:
submission


Unnamed: 0,RESTAURANT_ID,COST
0,4085,1205.218901
1,12680,378.558179
2,1411,690.885024
3,204,565.323370
4,13453,257.461645
...,...,...
4226,9057,1052.283964
4227,1247,494.070318
4228,8617,353.987812
4229,6485,295.733456
