In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the company records from the Excel workbook and preview the first rows.
database = pd.read_excel(io="Company.xlsx")
database.head(n=3)

Unnamed: 0,id,name,role,salary,age,gender,alive,department,alone,phonenumber,city,state,single,education,loanstatus,Isadmin,investement,yearloadbill
0,1.0,Nathan Bolton,Manager,43056.0,45.0,Other,No,Support,No,4579240482,Jordanbury,Wisconsin,No,High School,Rejected,True,17317.0,50907.0
1,2.0,Andrew Doyle,Developer,79400.0,60.0,Other,Yes,IT,No,6881176726,Port Markfort,Texas,No,PhD,Approved,False,2091.0,106120.0
2,3.0,Rebecca Wiley,Designer,49856.0,21.0,Other,No,Finance,Yes,+1-820-049-2423,New Veronica,Alabama,Yes,Bachelors,Pending,True,49187.0,88174.0


In [3]:
# Count the missing values in each column before any imputation.
database.isna().sum(axis=0)

id               8
name            14
role            10
salary          10
age             10
gender          11
alive           10
department       8
alone           10
phonenumber      9
city            13
state            6
single           2
education        6
loanstatus      11
Isadmin          4
investement      9
yearloadbill     8
dtype: int64

In [4]:
# Impute numeric (float64 / int64) columns with their own column mean.
num_cols = database.select_dtypes(include=["float64", "int64"]).columns
column_means = database[num_cols].mean()
database[num_cols] = database[num_cols].fillna(column_means)

# Impute object-dtype columns with their most frequent value.
cat_cols = database.select_dtypes(include=["object"]).columns
for col in cat_cols:
    most_common = database[col].mode()[0]  # first mode when there are ties
    database[col] = database[col].fillna(most_common)

  database[col] = database[col].fillna(database[col].mode()[0])


In [5]:
# Re-count missing values per column to confirm the imputation left none.
database.isna().sum(axis=0)

id              0
name            0
role            0
salary          0
age             0
gender          0
alive           0
department      0
alone           0
phonenumber     0
city            0
state           0
single          0
education       0
loanstatus      0
Isadmin         0
investement     0
yearloadbill    0
dtype: int64

In [6]:
# Boolean mask per row: True where the row repeats an earlier, identical row.
database.duplicated(keep="first")

0      False
1      False
2      False
3      False
4      False
       ...  
495    False
496    False
497    False
498    False
499    False
Length: 500, dtype: bool

In [7]:
# Show the frame's dimensions as (number of rows, number of columns).
database.shape

(500, 18)

In [8]:
# Show the dtype pandas assigned to each column after imputation.
database.dtypes

id              float64
name             object
role             object
salary          float64
age             float64
gender           object
alive            object
department       object
alone            object
phonenumber      object
city             object
state            object
single           object
education        object
loanstatus       object
Isadmin            bool
investement     float64
yearloadbill    float64
dtype: object

In [9]:
# Remove fully identical rows, keeping the first occurrence of each.
database = database.drop_duplicates(keep="first")

In [10]:
# Discard every row that still contains at least one missing value.
database = database.dropna(axis=0, how="any")

In [11]:
# Preview the first three rows of the cleaned frame.
database.head(n=3)

Unnamed: 0,id,name,role,salary,age,gender,alive,department,alone,phonenumber,city,state,single,education,loanstatus,Isadmin,investement,yearloadbill
0,1.0,Nathan Bolton,Manager,43056.0,45.0,Other,No,Support,No,4579240482,Jordanbury,Wisconsin,No,High School,Rejected,True,17317.0,50907.0
1,2.0,Andrew Doyle,Developer,79400.0,60.0,Other,Yes,IT,No,6881176726,Port Markfort,Texas,No,PhD,Approved,False,2091.0,106120.0
2,3.0,Rebecca Wiley,Designer,49856.0,21.0,Other,No,Finance,Yes,+1-820-049-2423,New Veronica,Alabama,Yes,Bachelors,Pending,True,49187.0,88174.0


In [12]:
# Count the rows flagged as duplicates and report the total.
duplicate_mask = database.duplicated()
dupli: int = duplicate_mask.sum()
print("Number of duplicate rows:", dupli)

Number of duplicate rows: 0


In [13]:
# Trim surrounding whitespace from each name, then turn the remaining spaces
# into underscores (literal replacement, not a regular expression).
stripped_names = database["name"].str.strip()
database["name"] = stripped_names.str.replace(" ", "_", regex=False)

In [15]:
# Bring in scikit-learn's LabelEncoder and create one (unfitted) instance.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [17]:
# Split the cleaned frame into a feature matrix `x` and a target vector `y`.
#
# The target column is removed by name rather than by position, so the split
# does not silently change if the column order does.
target_col = "yearloadbill"
features = database.drop(columns=[target_col])

# StandardScaler and LogisticRegression only accept numeric input, but most of
# the remaining columns hold text (name, role, gender, city, ...) and `Isadmin`
# is boolean. Replace every non-numeric column with sorted integer codes.
# Values are cast to str first because a column may mix ints and strings.
# NOTE(review): integer codes impose an arbitrary order on nominal categories,
# and identifier-like columns (id, name, phonenumber) are kept only to preserve
# the original feature set - consider one-hot encoding / dropping them.
for col in features.select_dtypes(exclude="number").columns:
    features[col] = pd.factorize(features[col].astype(str), sort=True)[0]

x = features.to_numpy(dtype=float)

# LogisticRegression is a classifier: it rejects a continuous target, and
# roc_auc_score needs class labels. Turn the bill amount into a binary label:
# 1 when the value is above the column median, 0 otherwise.
# NOTE(review): the median threshold is an assumption made so the pipeline can
# run - confirm the intended classification target (e.g. `loanstatus`).
bill = database[target_col]
y = (bill > bill.median()).astype(int).to_numpy()

In [18]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(x_train)
x_train_scaled = sc.transform(x_train)
x_test_scaled = sc.transform(x_test)

In [None]:
# Train a logistic regression with default settings on the scaled training
# split; the trailing expression displays the fitted estimator.
log_reg = LogisticRegression().fit(x_train_scaled, y_train)
log_reg

In [None]:
# Predict a label for every row of the scaled test split with the fitted model.
y_predict = log_reg.predict(x_test_scaled)

In [None]:
# ROC AUC ranks examples by a continuous score, so give it the predicted class
# probabilities; hard labels from predict() reduce the curve to a single point.
y_proba = log_reg.predict_proba(x_test_scaled)

# Two classes: score with the probability of the second (positive) class.
# More than two classes: pass the full probability matrix, scored one-vs-rest
# (`multi_class` is ignored for a binary target).
roc_auc_score(
    y_test,
    y_proba[:, 1] if y_proba.shape[1] == 2 else y_proba,
    multi_class="ovr",
)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep the smallest number of principal components whose cumulative explained
# variance reaches 95%. The integer form n_components=95 asks for 95 components,
# which PCA rejects when the data has fewer features than that (the loaded
# frame has only 18 columns).
pca = PCA(n_components=0.95)

In [None]:
# Fit the PCA projection on the scaled training split and project it, then
# project the scaled test split with that same fitted projection.
x_train_pca = pca.fit_transform(x_train_scaled)
x_test_pca = pca.transform(x_test_scaled)

In [None]:
# Train a second default logistic regression, this time on the PCA-projected
# training split; the trailing expression displays the fitted estimator.
log_reg2 = LogisticRegression().fit(x_train_pca, y_train)
log_reg2

In [None]:
# Predict a label for every row of the PCA-projected test split with the second model.
y_predict2 = log_reg2.predict(x_test_pca)

In [None]:
# ROC AUC ranks examples by a continuous score, so give it the predicted class
# probabilities; hard labels from predict() reduce the curve to a single point.
y_proba2 = log_reg2.predict_proba(x_test_pca)

# Two classes: score with the probability of the second (positive) class.
# More than two classes: pass the full probability matrix, scored one-vs-rest
# (`multi_class` is ignored for a binary target).
roc_auc_score(
    y_test,
    y_proba2[:, 1] if y_proba2.shape[1] == 2 else y_proba2,
    multi_class="ovr",
)