In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [3]:
credit_df=pd.read_csv('Credit_card.csv')
credit_df.head()


Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


In [4]:
credit_df.dtypes

Ind_ID               int64
GENDER                 str
Car_Owner              str
Propert_Owner          str
CHILDREN             int64
Annual_income      float64
Type_Income            str
EDUCATION              str
Marital_status         str
Housing_type           str
Birthday_count     float64
Employed_days        int64
Mobile_phone         int64
Work_Phone           int64
Phone                int64
EMAIL_ID             int64
Type_Occupation        str
Family_Members       int64
dtype: object

In [5]:
# 1. Data cleaning
credit_df = credit_df.drop(columns=['Ind_ID','Mobile_phone','Work_Phone','Phone','EMAIL_ID'])

In [6]:
# Convert categorical variables to numeric
credit_df = pd.get_dummies(credit_df, columns=['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income','EDUCATION', 'Marital_status','Housing_type','Type_Occupation'], drop_first=True)
credit_df.head()

Unnamed: 0,CHILDREN,Annual_income,Birthday_count,Employed_days,Family_Members,GENDER_M,Car_Owner_Y,Propert_Owner_Y,Type_Income_Pensioner,Type_Income_State servant,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
0,0,180000.0,-18772.0,365243,2,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,315000.0,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,315000.0,,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,315000.0,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
# replace NAN with the median
credit_df['Birthday_count']=credit_df['Birthday_count'].fillna(credit_df['Birthday_count'].median())
credit_df['Annual_income']=credit_df['Annual_income'].fillna(credit_df['Annual_income'].median())
credit_df['CHILDREN']=credit_df['CHILDREN'].fillna(credit_df['CHILDREN'].median())


In [8]:
credit_df.head()

Unnamed: 0,CHILDREN,Annual_income,Birthday_count,Employed_days,Family_Members,GENDER_M,Car_Owner_Y,Propert_Owner_Y,Type_Income_Pensioner,Type_Income_State servant,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
0,0,180000.0,-18772.0,365243,2,True,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
1,0,315000.0,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,315000.0,-15661.5,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0,166500.0,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,315000.0,-13557.0,-586,2,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
correlation = credit_df.corr()
correlation

Unnamed: 0,CHILDREN,Annual_income,Birthday_count,Employed_days,Family_Members,GENDER_M,Car_Owner_Y,Propert_Owner_Y,Type_Income_Pensioner,Type_Income_State servant,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
CHILDREN,1.0,0.078543,0.277944,-0.219095,0.890248,0.063068,0.063467,-0.003352,-0.217467,0.035128,...,0.022808,-0.007826,0.052493,0.029932,0.031794,0.027202,0.017414,0.014061,-0.002111,-0.015601
Annual_income,0.078543,1.0,0.110682,-0.158966,0.051522,0.20797,0.208948,0.043012,-0.145588,0.051633,...,-0.02387,-0.040497,0.274205,-0.052358,0.008767,0.00367,-0.03081,-0.019408,-0.010724,-0.034754
Birthday_count,0.277944,0.110682,1.0,-0.614271,0.265066,0.182646,0.144234,-0.124409,-0.608547,0.060405,...,0.131486,0.001981,0.1034,-0.015216,0.027483,0.04156,0.141982,0.002523,-0.042292,0.041076
Employed_days,-0.219095,-0.158966,-0.614271,1.0,-0.238705,-0.176383,-0.150658,0.100686,0.981839,-0.131034,...,-0.206216,-0.03402,-0.13987,-0.085226,-0.047977,-0.016329,-0.129966,-0.034856,-0.056413,-0.025315
Family_Members,0.890248,0.051522,0.265066,-0.238705,1.0,0.099406,0.119851,-0.004458,-0.232919,0.042123,...,0.044543,0.004901,0.055483,0.045989,0.001665,0.031831,0.000752,-0.004067,0.005206,-0.009703
GENDER_M,0.063068,0.20797,0.182646,-0.176383,0.099406,1.0,0.366257,-0.038264,-0.172277,-0.01814,...,0.236158,0.047559,0.090425,-0.131507,-0.067362,-0.027382,-0.138115,-0.058219,0.093858,-0.043337
Car_Owner_Y,0.063467,0.208948,0.144234,-0.150658,0.119851,0.366257,1.0,0.002401,-0.147484,0.011206,...,0.062545,-0.045521,0.093873,-0.016053,-0.023412,-0.029557,-0.074187,-0.045521,-0.032153,-0.02357
Propert_Owner_Y,-0.003352,0.043012,-0.124409,0.100686,-0.004458,-0.038264,0.002401,1.0,0.098419,-0.034443,...,-0.024589,0.020126,0.030028,-0.035471,0.011823,-0.011515,-0.023157,0.020126,0.018173,-0.006271
Type_Income_Pensioner,-0.217467,-0.145588,-0.608547,0.981839,-0.232919,-0.172277,-0.147484,0.098419,1.0,-0.130526,...,-0.209847,-0.012646,-0.136306,-0.074143,-0.048326,-0.016495,-0.134141,-0.035071,-0.058757,-0.026106
Type_Income_State servant,0.035128,0.051633,0.060405,-0.131034,0.042123,-0.01814,0.011206,-0.034443,-0.130526,1.0,...,-0.071869,-0.021765,0.033012,0.183929,0.017094,-0.010237,-0.083249,0.04278,-0.036465,-0.016202


In [10]:
# 2. Univariate Linear Regression to predict Birthday_count from Employed_days
credit_df_1= credit_df[['Employed_days','Birthday_count']]


X = np.array(credit_df_1['Employed_days']).reshape(-1,1)

y = np.array(credit_df_1['Birthday_count']).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 

regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

y_pred = regression_model.predict(X_test)

accuracy_score = regression_model.score(X_test, y_test)
print(accuracy_score)


0.3910979159137805


2. Our model did not perform all that well with an accuracy of about 39.1%. Using linear regression is probably not a good choice for this data set as we have lots of discrete data and only really 3 continous variables

In [None]:
# 3. function that selects the optimal k value
def optimal_k(X_train, y_train, X_test, y_test):
    k_values = range(1, 50)
    best_k = 1
    best_score = 0
    
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        score = knn.score(X_test, y_test)
        
        if score > best_score:
            best_score = score
            best_k = k
            
    return best_k, best_score

   


In [16]:
# 3. Using KNN to predict if someone is a property owner
X = credit_df.drop('Propert_Owner_Y', axis=1)
y = credit_df['Propert_Owner_Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

knn = KNeighborsClassifier(n_neighbors=optimal_k(X_train, y_train, X_test, y_test)[0])

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(knn.score(X_test, y_test))

0.6344086021505376


Are model performed much better with with an accuracy of about 63.4% which is much better but still not that great

In [22]:
# 4. Logistic Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42,
                                                    stratify=y)
regression = LogisticRegression(random_state=42, max_iter=10000).fit(X_train, y_train)
y_predicted = regression.predict(X_test)
regression.score(X_test, y_test)

0.6903225806451613

This model performed the best of all of our models with an acurracy of about 69.0% which is better but still not super accurate.

In [23]:
# 5. Linear regression with normalization
credit_df_1= credit_df[['Employed_days','Birthday_count']]


X = np.array(credit_df_1['Employed_days']).reshape(-1,1)

y = np.array(credit_df_1['Birthday_count']).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) 

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

y_pred = regression_model.predict(X_test)

accuracy_score = regression_model.score(X_test, y_test)
print(accuracy_score)

0.3912042876023383


In [24]:
#5. KNN with normalization
X = credit_df.drop('Propert_Owner_Y', axis=1)
y = credit_df['Propert_Owner_Y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

knn = KNeighborsClassifier(n_neighbors=optimal_k(X_train, y_train, X_test, y_test)[0])

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print(knn.score(X_test, y_test))

0.6731182795698925


In [29]:
# 5. Logistic regression with normalization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=42,
                                                  stratify=y)  

sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

regression = LogisticRegression(random_state=42, max_iter=10000).fit(X_train, y_train)
y_predicted = regression.predict(X_test)
regression.score(X_test, y_test)


0.6817204301075269

When we normalized the data, it didn't change the accuracy with linear regression much at all, it improve the KNN accuracy and actually decreased the logisitic regression accuracy. Here it would probably be best to use accuracy since we care more about how correct the model is and not neccesarily about false postives or false negatives