## Importing Libraries 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Get the Data

In [2]:
%%time
file=('Income Train.xlsx')
df = pd.read_excel(file)

CPU times: total: 11.2 s
Wall time: 11.2 s


## Exploratory Data Analysis

### Check the shape of Dataset

In [3]:
%%time
df.shape

CPU times: total: 0 ns
Wall time: 0 ns


(36631, 15)

### Preview Dataset

In [4]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,18,Private,128538,11th,7,Never-married,Sales,Own-child,White,Female,0,0,25,United-States,<=50K
1,36,Private,112271,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,1902,40,United-States,>50K.
2,35,Private,111387,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,1579,40,United-States,<=50K
3,55,Private,118993,Some-college,10,Separated,Exec-managerial,Unmarried,White,Female,0,0,10,United-States,<=50K
4,35,?,317780,Some-college,10,Never-married,?,Unmarried,Black,Female,0,0,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36626,49,Local-gov,106554,Bachelors,13,Divorced,Prof-specialty,Unmarried,White,Female,0,0,40,United-States,>50K
36627,30,Private,231263,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,<=50K
36628,30,Private,197558,HS-grad,9,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,0,40,United-States,<=50K
36629,23,,234108,Bachelors,13,Never-married,,Not-in-family,White,Male,0,0,35,United-States,<=50K.


In [5]:
df['income'] = df['income'].str.strip('.')
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,18,Private,128538,11th,7,Never-married,Sales,Own-child,White,Female,0,0,25,United-States,0
1,36,Private,112271,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,1902,40,United-States,1
2,35,Private,111387,Some-college,10,Married-civ-spouse,Handlers-cleaners,Husband,White,Male,0,1579,40,United-States,0
3,55,Private,118993,Some-college,10,Separated,Exec-managerial,Unmarried,White,Female,0,0,10,United-States,0
4,35,?,317780,Some-college,10,Never-married,?,Unmarried,Black,Female,0,0,40,United-States,0


In [6]:
# Assuming you have loaded your DataFrame 'df' before this point

# Split the data into features (X) and target variable (y)
X = df.drop(['income'], axis=1)
y = df['income']

## Split the data into separate training and test set

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Feature Engineering

### Encode categorical variables

In [8]:
from sklearn import preprocessing

categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.fit_transform(X_test[feature])

## Feature Scaling

In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

## Logistic Regression with PCA

### Explained Variance Ratio

In [10]:
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
sum(((pca.explained_variance_ratio_)*100)[:13])

97.23197872363221

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36631 entries, 0 to 36630
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             36631 non-null  int64 
 1   workclass       35907 non-null  object
 2   fnlwgt          36631 non-null  int64 
 3   education       36631 non-null  object
 4   education-num   36631 non-null  int64 
 5   marital-status  36631 non-null  object
 6   occupation      35905 non-null  object
 7   relationship    36631 non-null  object
 8   race            36631 non-null  object
 9   sex             36631 non-null  object
 10  capital-gain    36631 non-null  int64 
 11  capital-loss    36631 non-null  int64 
 12  hours-per-week  36631 non-null  int64 
 13  native-country  36435 non-null  object
 14  income          36631 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 4.2+ MB


### Logistic Regression with 13 features

In [12]:
from sklearn.linear_model import LogisticRegression
x = df.drop(['income','native-country'], axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.fit_transform(X_test[feature])
    
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


from sklearn.metrics import classification_report

classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print(classification_rep)

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      8371
           1       0.71      0.47      0.57      2619

    accuracy                           0.83     10990
   macro avg       0.78      0.70      0.73     10990
weighted avg       0.82      0.83      0.81     10990



### Logistic Regression with 12 features

In [13]:
from sklearn.linear_model import LogisticRegression
x = df.drop(['income','native-country', 'hours-per-week'], axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.fit_transform(X_test[feature])
    
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


from sklearn.metrics import classification_report

classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print(classification_rep)

              precision    recall  f1-score   support

           0       0.85      0.94      0.89      8371
           1       0.71      0.46      0.56      2619

    accuracy                           0.83     10990
   macro avg       0.78      0.70      0.72     10990
weighted avg       0.81      0.83      0.81     10990



### Logistic Regression with 11 features

In [14]:
from sklearn.linear_model import LogisticRegression
x = df.drop(['income','native-country','hours-per-week','capital-loss'], axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.fit_transform(X_test[feature])
    
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)


from sklearn.metrics import classification_report

classification_rep = classification_report(y_test, y_pred)

# Print the classification report
print(classification_rep)

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      8371
           1       0.71      0.44      0.55      2619

    accuracy                           0.82     10990
   macro avg       0.78      0.69      0.72     10990
weighted avg       0.81      0.82      0.81     10990



## Best Feature

In [16]:
from sklearn.linear_model import LogisticRegression
x = df.drop(['income'], axis=1)
y = df['income']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for feature in categorical_features:
    le = preprocessing.LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])
    X_test[feature] = le.fit_transform(X_test[feature])
    
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)

pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
dim = np.argmax(cumsum == 0.90) * 1
print('The number of dimensions required to preserve 90% of variance is: ',dim)

The number of dimensions required to preserve 90% of variance is:  0


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# Assuming you have loaded your DataFrame 'df' before this point

# Split the data into features (X) and target variable (y)
X = df.drop(['income'], axis=1)
y = df['income']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Encode categorical features using LabelEncoder
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
for feature in categorical_features:
    le = LabelEncoder()
    X_train[feature] = le.fit_transform(X_train[feature])

# Standardize the numerical features using StandardScaler
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

# Apply PCA
pca = PCA()
pca.fit(X_train_scaled)

# Get the explained variance ratio for each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Create a DataFrame with column names and their explained variance
variance_df = pd.DataFrame({'Feature': X_train_scaled.columns, 'Explained Variance': explained_variance_ratio})

# Print the DataFrame


variance_df['Explained Variance']=round(variance_df['Explained Variance']*100,2)


variance_df=variance_df.sort_values(['Explained Variance'],ascending=False)


variance_df['Cumulative Variance'] = np.cumsum(variance_df['Explained Variance'])


variance_df

Unnamed: 0,Feature,Explained Variance,Cumulative Variance
0,age,14.81,14.81
1,workclass,10.13,24.94
2,fnlwgt,9.24,34.18
3,education,8.09,42.27
4,education-num,7.84,50.11
5,marital-status,7.36,57.47
6,occupation,6.78,64.25
7,relationship,6.62,70.87
8,race,6.16,77.03
9,sex,6.05,83.08
