In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from statistics import mean
from matplotlib import pyplot
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import pickle
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.model_selection import GridSearchCV
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Reading Training CSV**


In [None]:
# Load the raw training file into `df` and preview its first rows.
df = pd.read_csv("/kaggle/input/mycanaradacoe/Training data file.csv")
df.head()

In [None]:
# (rows, columns) of the raw training frame.
df.shape

In [None]:
# Count how many rows carry each Target_Flag value (shows how balanced the classes are).
df['Target_Flag'].value_counts()

**Analyzing null values per column and dropping the columns with the most null values**

In [None]:
# Number of missing values per column, largest first.
df.isnull().sum().sort_values(ascending = False)

In [None]:
# Remove the two columns chosen for dropping after the null-count inspection.
df = df.drop(columns=['CoAp_Income', 'Max_Ratio_OC_Pending_POS'])

In [None]:
# Shape after removing the two columns.
df.shape

In [None]:
# Preview the frame after the column removal.
df.head()

**Analyzing Categorical Column and applying One Hot Encoding**

In [None]:
# Per-column dtypes; non-numeric columns need encoding before modelling.
df.dtypes

In [None]:
# Number of distinct values in Ever_Default_L12M (one dummy column is created per value below).
df['Ever_Default_L12M'].nunique()

In [None]:
# Replace Ever_Default_L12M with integer 0/1 indicator columns named
# "Ever_Default_L12M_<value>"; get_dummies appends them at the end of the frame.
df_ohe = pd.get_dummies(
    df,
    columns=['Ever_Default_L12M'],
    prefix='Ever_Default_L12M',
    prefix_sep='_',
    dtype=int,
)
df = df_ohe

In [None]:
# Shape after one-hot encoding.
df.shape

In [None]:
# Preview the encoded frame.
df.head()

**Rearranging Columns and changing all numeric values to float64**

In [None]:
# Move the column at position 18 to position 1, keeping column 0 first and
# everything else in its existing order (21 columns are kept in total).
# NOTE(review): presumably position 18 is Target_Flag and 19-20 are the dummy
# columns — confirm against the df.head() output above.
cols = df.columns.tolist()
cols = [cols[0], cols[18], *cols[1:18], *cols[19:21]]
df = df[cols]
df.head()

In [None]:
# Cast every column except the first to float64.
# Assigning through `df.iloc[:, 1:] = ...` writes the values in place and keeps
# the existing column dtypes under pandas 2.x, so integer columns would stay
# integer. `astype` with a per-column mapping actually changes the dtypes.
df = df.astype(dict.fromkeys(df.columns[1:], 'float64'))

In [None]:
# Preview the frame after the float conversion.
df.head()

**Filling missing values with the mode of each column**

In [None]:
# Impute columns 2..20 one at a time: each missing entry becomes that column's
# most frequent value (the first one when several values tie).
for col_pos in range(2, 21):
    column = df.iloc[:, col_pos]
    most_frequent = column.mode()[0]
    df.iloc[:, col_pos] = column.fillna(most_frequent)

In [None]:
# Preview the frame after imputation.
df.head()

**Using StandardScaler() to normalize our numeric columns**

In [None]:
# Standardize columns 2..18 to zero mean / unit variance.
# The fitted scaler is kept in `train_scaler` instead of being discarded: any
# other data scored by a model trained on these columns has to be transformed
# with these same means and scales, not with a scaler re-fitted on that data.
train_numeric = df.iloc[:, 2:19]
train_scaler = StandardScaler().fit(train_numeric)
df.iloc[:, 2:19] = train_scaler.transform(train_numeric)

In [None]:
# Preview the standardized frame.
df.head()

In [None]:
# Write the preprocessed training frame to the working directory (no index column).
df.to_csv("preprocessed_csv.csv", index = False)

In [None]:
# Reload the preprocessed training data from a Kaggle input dataset.
# NOTE(review): this path differs from the file written in the cell above —
# presumably an uploaded copy of it; confirm the two are in sync.
df = pd.read_csv('/kaggle/input/preprocessed-train-csv/preprocessed_csv.csv')
df.head()

In [None]:
# Plain NumPy view of the frame: column 1 is the target, columns 2+ are the
# features (column 0 is left out of the model inputs).
dfm = df.to_numpy()
X, y = dfm[:, 2:], dfm[:, 1]

**Oversampling the minority class using SMOTE**

In [None]:
# Hold out 20% of the ORIGINAL rows first, then oversample only the remaining
# training portion. Running SMOTE before the split lets synthetic rows that were
# interpolated from test rows end up in the training set (data leakage), which
# inflates every downstream score.
# Seeds are fixed so the split and the synthetic samples are reproducible.
X_fit, over_X_test, y_fit, over_y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
oversample = SMOTE(random_state=42)
over_X, over_y = oversample.fit_resample(X_fit, y_fit)

# The oversampled data is the training set; the hold-out stays untouched.
over_X_train, over_y_train = over_X, over_y

**Using GridSearchCV to fine-tune our model parameters**

In [None]:
# Base estimator for the search; the seed makes every candidate fit repeatable.
SMOTE_SRF = RandomForestClassifier(random_state=42)

# Candidate hyper-parameters: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],     # number of trees in the forest
    max_depth=[None, 6, 7, 8],        # maximum depth of each tree (None = unlimited)
    min_samples_split=[2, 5, 10],     # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],       # minimum samples required at a leaf
)

# Exhaustive 5-fold search scored by ROC AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=SMOTE_SRF,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
).fit(over_X_train, over_y_train)

# Winner of the search: refitted estimator, its parameters, and its mean CV score.
best_model, best_params, best_roc_auc = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)

In [None]:
# Rebuild the classifier from fixed hyper-parameters and fit it on the
# oversampled training data. This rebinds `best_model`, replacing the estimator
# taken from the grid search.
# NOTE(review): presumably these are the values the grid search reported as
# best — confirm against `best_params`.
best_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(over_X_train, over_y_train)
best_model

In [None]:
# Stratified 80/20 split of the original (non-oversampled) data.
# random_state is fixed so the same rows form the test set on every run;
# without it the reported metrics change each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

**Training and Prediction part**

In [None]:
# Horizontal bar chart of the fitted forest's feature importances, labelled with
# the column names that correspond to the model inputs (columns 2+ of `df`).
feature_importances = best_model.feature_importances_
feature_names = df.columns[2:]
bar_positions = range(len(feature_importances))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, feature_importances, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(feature_names)
ax.set_xlabel('Feature Importance')
ax.set_title('Random Forest Classifier - Feature Importance')
plt.show()


In [None]:
# Hard class predictions for the hold-out rows.
y_pred = best_model.predict(X_test)
# Column 1 of predict_proba; presumably the probability of the positive class
# (label 1) — confirm with best_model.classes_.
y_prob = best_model.predict_proba(X_test)[:, 1]

**Classifier Report and AUC Score**

In [None]:
# ROC AUC is computed from the predicted probabilities; precision / recall / F1
# come from the hard predictions.
roc = roc_auc_score(y_test, y_prob)
report = classification_report(y_test, y_pred)

print(report)
print(f"AUC Score: {roc}")

**Reading Validation dataset and preprocessing like we did for Training dataset**

In [None]:
# Load the raw validation file into `df1` and preview its first rows.
df1 = pd.read_csv("/kaggle/input/mycanaradacoe/Validation Data file.csv")
df1.head()

In [None]:
# (rows, columns) of the raw validation frame.
df1.shape

In [None]:
# Number of missing values per validation column, largest first.
df1.isnull().sum().sort_values(ascending = False)

In [None]:
# Remove the same two columns that were removed from the training frame.
df1 = df1.drop(columns=['CoAp_Income', 'Max_Ratio_OC_Pending_POS'])

In [None]:
# One-hot encode Ever_Default_L12M exactly as was done for the training frame;
# the indicator columns are appended at the end.
df_ohe = pd.get_dummies(
    df1,
    columns=['Ever_Default_L12M'],
    prefix='Ever_Default_L12M',
    prefix_sep='_',
    dtype=int,
)
df1 = df_ohe

In [None]:
# Cast every column except the first to float64.
# Assigning through `df1.iloc[:, 1:] = ...` writes the values in place and keeps
# the existing column dtypes under pandas 2.x, so integer columns would stay
# integer. `astype` with a per-column mapping actually changes the dtypes.
df1 = df1.astype(dict.fromkeys(df1.columns[1:], 'float64'))

In [None]:
# Preview the validation frame after the float conversion.
df1.head()

In [None]:
# Column count here determines the positional ranges used in the cells below.
df1.shape

In [None]:
# Impute columns 1..19 one at a time: each missing entry becomes that column's
# most frequent value (the first one when several values tie).
for col_pos in range(1, 20):
    column = df1.iloc[:, col_pos]
    most_frequent = column.mode()[0]
    df1.iloc[:, col_pos] = column.fillna(most_frequent)

In [None]:
# Standardize columns 1..17 of the validation frame.
# NOTE(review): this fits a brand-new scaler on the validation rows, whereas the
# training frame was scaled with its own separately fitted StandardScaler. The
# two sets are therefore not on the same scale — consider reusing the training
# scaler here.
val_numeric = df1.iloc[:, 1:18]
df1.iloc[:, 1:18] = StandardScaler().fit_transform(val_numeric)

In [None]:
# Preview the standardized validation frame.
df1.head()

In [None]:
# Write the preprocessed validation frame to the working directory (no index column).
df1.to_csv("preprocessed_val_csv.csv", index = False)

In [None]:
# Reload the preprocessed validation data from a Kaggle input dataset.
# NOTE(review): this path differs from the file written in the cell above —
# presumably an uploaded copy of it; confirm the two are in sync.
df1 = pd.read_csv('/kaggle/input/preprocessedvalpart/preprocessed_val_csv.csv')
df1.head()

In [None]:
# NumPy view of the validation frame; column 0 is kept aside (it is written out
# as Customer_No later) and columns 1+ are the model inputs.
val = df1.values
# NOTE: this rebinds `X`, which previously held the training features.
X = val[:,1:]

**Predicting values and Probability of Validation dataset**

In [None]:
# Hard class predictions for the validation rows.
y_pred_val = best_model.predict(X)
# Column 1 of predict_proba; presumably the probability of the positive class
# (label 1) — confirm with best_model.classes_.
y_prob_val = best_model.predict_proba(X)[:, 1]

**Creating our final output dataframe and then exporting it**

In [None]:
# Columns of the submission file: identifier, predicted class, and the
# predicted probability taken from column 1 of predict_proba.
fields = dict(
    Customer_No=val[:, 0],
    Prediction=y_pred_val,
    Probability_Prediction=y_prob_val,
)

In [None]:
# Build the output frame, store the identifier and predicted class as integers,
# and write it to the working directory without the index column.
final = pd.DataFrame(fields).astype({'Customer_No': int, 'Prediction': int})
final.to_csv("ekshanrajverma_iithyderabad.csv", index=False)

In [None]:
# Load the prediction file from a Kaggle input dataset.
# NOTE(review): this is not the file written to the working directory in the
# cell above — presumably an uploaded copy; confirm it is the current version.
df2 = pd.read_csv("/kaggle/input/final-result/ekshanrajverma_iithyderabad.csv")
df2.head()

In [None]:
# (rows, columns) of the loaded prediction file.
df2.shape

In [None]:
# Load the customer contact table from the working directory and preview it.
dfs = pd.read_csv("/kaggle/working/customer_info.csv")
dfs.head()

In [None]:
# Overwrite the e-mail address of the row labelled 4.
# A single .loc call is used instead of chained indexing (dfs['E-mail'][4] = ...),
# which may write to a temporary copy and does nothing under Copy-on-Write.
dfs.loc[4, 'E-mail'] = 'ch20btech11012@iith.ac.in'

# Drop the leftover index column if it is present. errors='ignore' keeps the
# cell re-runnable: the file is saved back without this column, so on a second
# run it no longer exists and `del` would raise KeyError.
dfs = dfs.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Save the edited customer table to the current working directory (no index column).
dfs.to_csv('customer_info.csv',index = False)

In [None]:
# Keep only the customers whose Prediction is 1 and attach their contact
# details; the inner join drops anyone missing from either table.
flagged = df2.loc[df2['Prediction'] == 1]
dfk = dfs.merge(flagged, how='inner', on='Customer_No')
dfk.head()

In [None]:
pip install openai

In [None]:
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.base import MIMEBase
from email import encoders
import os
import openai
# Credentials are read from the environment so that no secret is stored in the
# notebook. Any key that has ever been committed in plain text should be revoked.
# The organization is optional (None when unset); a missing API key fails fast
# with KeyError instead of failing later on the first request.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")
openai.api_key = os.environ["OPENAI_API_KEY"]
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only (user-role) message.
        model: Chat model name passed to the API.

    Returns:
        The "content" of the message in the first returned choice.
    """
    chat = openai.ChatCompletion.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    first_choice = chat.choices[0]
    return first_choice.message["content"]

In [None]:
# Generate and send one EMI reminder e-mail per customer in `dfk`.
prompt1 = "EMI Payment Reminder (mail body) in 300 words (without subject) for IDFC bank customer"
prompt3 = "Mail Subject for EMI Repayment Reminder"

sender_address = 'akshanrajverma@gmail.com'
# The mailbox password is read from the environment so it is never stored in
# the notebook; a missing variable fails fast with KeyError before any mail is built.
sender_pass = os.environ['GMAIL_APP_PASSWORD']

# The subject prompt is identical for every customer (and temperature is 0),
# so it is requested once instead of once per recipient.
subject = get_completion(prompt3)

for name, email in zip(dfk['Name'], dfk['E-mail']):
    # Explicit separator: plain concatenation fused the two prompts into
    # "...bank customercustomer name is<name>".
    prompt2 = ". Customer name is " + name
    mail_content = get_completion(prompt1 + prompt2)
    reciever_address = email

    message = MIMEMultipart()
    message['From'] = sender_address
    message['To'] = reciever_address
    message['Subject'] = subject
    message.attach(MIMEText(mail_content, 'plain'))
    text = message.as_string()

    # The context manager closes the SMTP connection even if login or sending raises.
    with smtplib.SMTP('smtp.gmail.com', 587) as session:
        session.starttls()
        session.login(sender_address, sender_pass)
        session.sendmail(sender_address, reciever_address, text)
    print('Mail Sent to ' + name)


