In [1]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

## Importing necessary packages:

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn.preprocessing as preprocessing 
import sklearn.metrics as metrics 
import sklearn.calibration as calibration 
import sklearn.linear_model as linear_model 
import sklearn.svm as svm 
import sklearn.naive_bayes as naive_bayes
import sklearn.model_selection as model_selection

## Reading the necessary datasets:

In [None]:
# Load the two competition CSVs.
train_dataset = pd.read_csv("/kaggle/input/iiitb-ai511group-3-classification/train.csv")
test_dataset = pd.read_csv("/kaggle/input/iiitb-ai511group-3-classification/test.csv")

# Stack both splits so the cleaning/encoding cells below treat them identically.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent. Row labels are kept as-is (matching the old `append` default), so
# the combined index contains repeated labels.
combined_dataset = pd.concat([train_dataset, test_dataset])

In [None]:
# Show the first rows of the combined train + test frame.
combined_dataset.head()

In [None]:
# Summary statistics of the combined frame, before any cleaning.
combined_dataset.describe()

In [None]:
# Number of missing values in each column of the combined frame.
combined_dataset.isna().sum()

## Checking for columns with NaN values:

In [None]:
# Report each column that contains missing values: name, NaN count, dtype.
na_counts = combined_dataset.isna().sum()
for column, n_missing in na_counts[na_counts > 0].items():
    print(column, n_missing, combined_dataset[column].dtype)

## Handling missing (NaN) values:

In [None]:
# Remove these two columns outright instead of imputing them.
combined_dataset = combined_dataset.drop(columns=["Personal76", "Property34"])

In [None]:
# Impute every column that still has gaps with its most frequent value.
# "Conversion_result" is skipped on purpose: its NaNs are what the train/test
# split further down uses to tell the two sets apart.
for column in combined_dataset.columns:
    if column == "Conversion_result":
        continue
    if combined_dataset[column].isna().any():
        most_frequent = combined_dataset[column].value_counts().idxmax()
        # Assign the filled Series back to the frame. Calling
        # `fillna(..., inplace=True)` on `combined_dataset[column]` works on a
        # temporary object: it triggers a chained-assignment FutureWarning on
        # pandas 2.2 and no longer updates the frame under Copy-on-Write.
        combined_dataset[column] = combined_dataset[column].fillna(most_frequent)

In [None]:
# Re-check after imputation: list any column that still contains NaNs
# (name, NaN count, dtype).
remaining_na = combined_dataset.isna().sum()
for column in remaining_na.index[remaining_na > 0]:
    print(column, remaining_na[column], combined_dataset[column].dtype)

In [None]:
# Columns stored with dtype 'object' are treated as the categorical ones.
object_frame = combined_dataset.select_dtypes(include="object")
categorical_columns = list(object_frame.columns)
print(categorical_columns)

In [None]:
## The 'Field7' column holds numbers but is stored with dtype 'object' (the values contain commas); the next cell converts it to int.

In [None]:
# Strip the commas from 'Field7' (literal replace, no regex), then cast the
# cleaned strings to integers.
field7_without_commas = combined_dataset["Field7"].str.replace(",", "", regex=False)
combined_dataset["Field7"] = field7_without_commas.astype(int)

In [None]:
# Encode the yes/no flag columns as 1/0.
YES_NO_CODES = {"Y": 1, "N": 0}

categorical_columns = combined_dataset.select_dtypes(include=['object']).columns.tolist()

print(categorical_columns)

for column in categorical_columns:
    # Only two-valued object columns are touched; values other than 'Y'/'N'
    # are left unchanged by `replace`.
    if combined_dataset[column].nunique() == 2:
        # Replace per column (no full-frame copy on every iteration) and call
        # `infer_objects()` explicitly: the automatic object -> int downcast in
        # `replace` is deprecated since pandas 2.2. Without it the encoded
        # column would stay 'object' and be dropped together with the
        # remaining categorical columns later on.
        combined_dataset[column] = combined_dataset[column].replace(YES_NO_CODES).infer_objects()

# 'Geographic63' additionally contains a single-space value, which is encoded
# as 1 (same code as 'Y').
# NOTE(review): the reason blanks map to 1 is not visible here - confirm.
combined_dataset['Geographic63'] = (
    combined_dataset['Geographic63'].replace({**YES_NO_CODES, " ": 1}).infer_objects()
)

In [None]:
# Correlation pruning of the numeric features.
# A column is dropped when its absolute correlation with an earlier column
# (in frame order) that is itself still kept is at least 0.95.
categorical_columns = combined_dataset.select_dtypes(include=['object']).columns.tolist()
continuous_columns = combined_dataset.drop(
    columns=categorical_columns + ["keyValue", "Conversion_result"]
)
corr_matrix = continuous_columns.corr()
abs_corr = corr_matrix.abs().to_numpy()
corr_names = list(corr_matrix.columns)

col_corr = set()
for i, colname in enumerate(corr_names):
    # Only earlier columns that have not been marked for removal count.
    if any(
        abs_corr[i, j] >= 0.95 and corr_names[j] not in col_corr
        for j in range(i)
    ):
        col_corr.add(colname)

# Remove all marked columns in a single pass.
combined_dataset = combined_dataset.drop(columns=list(col_corr))

In [None]:
# Summary statistics again, now that highly correlated columns are removed.
combined_dataset.describe()

In [None]:
# Discard the object-dtype columns listed in `categorical_columns`.
combined_dataset = combined_dataset.drop(columns=categorical_columns)

In [None]:
# Rows with a known "Conversion_result" form the training set; rows where it
# is NaN form the test set.
has_label = combined_dataset["Conversion_result"].notna()

# The training frame does not need the 'keyValue' column.
train_dataset = combined_dataset[has_label].drop(columns="keyValue")
# The test frame keeps 'keyValue' (needed for the submission file) but not
# the empty target column.
test_dataset = combined_dataset[~has_label].drop(columns="Conversion_result")

In [None]:
# Convert the frames to float arrays for scikit-learn.
train_features = train_dataset.drop(columns="Conversion_result")
main_x = train_features.to_numpy(dtype="float")
main_y = train_dataset["Conversion_result"].to_numpy(dtype="float")

# 'keyValue' is dropped from the test features; it stays in `test_dataset`.
test = test_dataset.drop(columns="keyValue")
kaggle_test_x = test.to_numpy(dtype="float")

In [None]:
## Stratified K Fold for testing.

# Fixed seed: the 'saga' solver shuffles the data, so without it repeated runs
# can give slightly different fits.
RANDOM_STATE = 42

skf = model_selection.StratifiedKFold(n_splits = 4)
scaler = preprocessing.StandardScaler()
model = linear_model.LogisticRegression(
    max_iter=1000,
    C=0.1,
    class_weight='balanced',
    solver='saga',
    random_state=RANDOM_STATE,
)

fold_scores = []
for train_index, test_index in skf.split(main_x, main_y):
    x_train, x_test = main_x[train_index], main_x[test_index]
    y_train, y_test = main_y[train_index], main_y[test_index]

    # Fit the scaler on the training fold only, then reuse it on the held-out
    # fold so no statistics leak from the validation data.
    x_scaled = scaler.fit_transform(x_train)
    model.fit(x_scaled, y_train)
    test_x_scaled = scaler.transform(x_test)
    yhat = model.predict(test_x_scaled)

    # f1_score expects (y_true, y_pred), in that order.
    fold_f1 = metrics.f1_score(y_test, yhat)
    fold_scores.append(fold_f1)
    print(fold_f1)

print("mean F1:", np.mean(fold_scores))

In [None]:
# Refit the scaler and the model on the full training data, then predict the
# Kaggle test rows.
x_scaled = scaler.fit_transform(main_x)
model.fit(x_scaled, main_y)
test_x_scaled = scaler.transform(kaggle_test_x)
kaggle_yhat = model.predict(test_x_scaled)

# Attach the predictions as integer labels and write the two-column
# submission file (no index column).
test_dataset = test_dataset.assign(Conversion_result=kaggle_yhat.astype("int"))
to_submit = test_dataset.loc[:, ["keyValue", "Conversion_result"]]
to_submit.to_csv("2019508_fifth_submission.csv", index=False)