In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Read the comma-separated raw file, naming its 15 columns explicitly
# (names are supplied here, so the first line of the file is read as data),
# then write a copy with a header row to adult.csv in the working directory.
column_names = [
    "Age", "Workclass", "ID", "Education", "Education_num",
    "Marital_Status", "Occupation", "Relationship", "Race", "Sex",
    "Capital_Gain", "Capital_Loss", "Hrs_per_week", "Country", "Target",
]
raw_path = "D:\\Cipher\\adult.data.txt"
df = pd.read_csv(raw_path, sep=',', names=column_names)

# NOTE(review): to_csv is left at its default index=True, so the row index is
# written as an extra unnamed first column of adult.csv.
df.to_csv('adult.csv')

In [None]:
# Data pre-processing: the placeholder for a missing value in this frame is the
# string ' ?' (note the leading space), which is what gets replaced below.

# Count placeholders per column BEFORE replacing them. The lookup must use the
# same ' ?' token as the replace call, otherwise it never matches anything.
placeholder_counts = df.isin([' ?']).sum(axis=0)
print("' ?' placeholders per column:")
print(placeholder_counts)

# Turn the placeholders into real NaN values so pandas treats them as missing
df = df.replace(to_replace=' ?', value=np.nan)

# Number of missing values in each column (printed explicitly: a bare
# expression in the middle of a cell is not displayed)
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)

# Drop every row that has at least one missing value; reassigning instead of
# using inplace=True keeps the step explicit
df = df.dropna(how='any')

In [None]:
# Checking the ratio of output variables.
# The counts are computed once and read by POSITION with .iloc: integer keys on
# a label-indexed Series (counts[0] / counts[1]) rely on a deprecated positional
# fallback in pandas.
target_counts = df['Target'].value_counts(sort=False)
first_count = target_counts.iloc[0]
second_count = target_counts.iloc[1]
pair_total = second_count + first_count

# Same print order as before: share of the second class, then of the first
print(second_count / pair_total)
print(first_count / pair_total)

In [None]:
# Replace the string values of the target column and of the gender column with
# integer labels. Each column gets its own freshly created LabelEncoder; after
# the loop `label_encod` is the encoder that was fitted on 'Sex'.
for encoded_col in ('Target', 'Sex'):
    label_encod = LabelEncoder()
    df[encoded_col] = label_encod.fit_transform(df[encoded_col])

In [None]:
# Feature selection aid: print, for every column, how often each value occurs
for col_name, col_values in df.items():
    print(col_values.value_counts())

In [None]:
# Bar chart of the mean of Target within each Age group, used to judge whether
# the variable is worth keeping
df.groupby('Age')['Target'].mean().plot.bar()

In [None]:
# Bar chart of the mean of Target within each Workclass group
df.groupby('Workclass')['Target'].mean().plot.bar()

In [None]:
# Bar chart of the mean of Target within each Education group
df.groupby('Education')['Target'].mean().plot.bar()

In [None]:
# Bar chart of the mean of Target within each Education_num group
df.groupby('Education_num')['Target'].mean().plot.bar()

In [None]:
# Bar chart of the mean of Target within each Race group
df.groupby('Race')['Target'].mean().plot.bar()

In [None]:
# Remove the columns the author judged redundant / noisy; they are not used by
# any later step in this notebook.
unused_columns = [
    'ID', 'Occupation', 'Country',
    'Capital_Loss', 'Capital_Gain', 'Hrs_per_week', 'Age',
]
df = df.drop(columns=unused_columns)

In [None]:
# Converting categorical data to numerical data.
# Each remaining text column is translated through a fixed value -> integer
# table. The keys keep the leading space that the values carry in the frame.
category_codes = {
    'Workclass': {
        ' Self-emp-inc': 0,
        ' State-gov': 1,
        ' Federal-gov': 2,
        ' Without-pay': 3,
        ' Local-gov': 4,
        ' Private': 5,
        ' Self-emp-not-inc': 6,
    },
    'Race': {
        ' Black': 0,
        ' Asian-Pac-Islander': 1,
        ' Other': 2,
        ' White': 3,
        ' Amer-Indian-Eskimo': 4,
    },
    'Education': {
        ' Some-college': 0,
        ' Preschool': 1,
        ' 5th-6th': 2,
        ' HS-grad': 3,
        ' Masters': 4,
        ' 12th': 5,
        ' 7th-8th': 6,
        ' Prof-school': 7,
        ' 1st-4th': 8,
        ' Assoc-acdm': 9,
        ' Doctorate': 10,
        ' 11th': 11,
        ' Bachelors': 12,
        ' 10th': 13,
        ' Assoc-voc': 14,
        ' 9th': 15,
    },
    'Relationship': {
        ' Not-in-family': 0,
        ' Wife': 1,
        ' Other-relative': 2,
        ' Unmarried': 3,
        ' Husband': 4,
        ' Own-child': 5,
    },
    'Marital_Status': {
        ' Married-spouse-absent': 0,
        ' Widowed': 1,
        ' Married-civ-spouse': 2,
        ' Separated': 3,
        ' Divorced': 4,
        ' Never-married': 5,
        ' Married-AF-spouse': 6,
    },
}

# A value missing from its table becomes NaN in .map(), which makes the
# following .astype(int) raise instead of passing silently.
for cat_col, codes in category_codes.items():
    df[cat_col] = df[cat_col].map(codes).astype(int)


In [None]:
# Feature matrix: every column except 'Target'; label vector: the 'Target' column
X, Y = df.drop(columns='Target'), df['Target']

In [None]:
# Score every feature against the target with the chi-squared statistic and
# print the five highest-scoring ones. The selector is only fitted here to read
# its scores; X itself is not transformed.
bestfeatures = SelectKBest(k=5, score_func=chi2)
fit = bestfeatures.fit(X, Y)

# One row per feature: its column name ('specs') and its chi-squared score
scores = pd.DataFrame({'specs': X.columns, 'score': fit.scores_})
print(scores.nlargest(5, 'score'))

In [None]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so
# the split is the same on every run.
split_parts = train_test_split(X, Y, random_state=0, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Applying Logistic Regression
# Fit on the training split and predict the held-out test split
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Evaluation artefacts: confusion matrix, per-class report, overall accuracy
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)
AC = accuracy_score(y_test, y_pred)

print("              Logistic Regression                ")
print()
print("Confusion matrix:\n" , CM )
print()

# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') shows larger counts in scientific notation.
# confusion_matrix puts the true labels on rows and predictions on columns.
heatmap_ax = sns.heatmap(CM, annot = True, fmt = 'd')
heatmap_ax.set(xlabel = "Predicted label", ylabel = "True label")

print("Accuracy Score:", (AC)*100)   # accuracy expressed as a percentage
print("Classification Report:\n", CR)

In [None]:
#Applying Naive Bayes
# Fit a Gaussian Naive Bayes model on the training split and predict the test split
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation artefacts: confusion matrix, per-class report, overall accuracy
CM = confusion_matrix(y_test, y_pred)
CR = classification_report(y_test, y_pred)
AC = accuracy_score(y_test, y_pred)

print("Naive Bayes")
print()
print("Confusion matrix:\n" , CM )
print()

# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') shows larger counts in scientific notation.
# confusion_matrix puts the true labels on rows and predictions on columns.
heatmap_ax = sns.heatmap(CM, annot = True, fmt = 'd')
heatmap_ax.set(xlabel = "Predicted label", ylabel = "True label")

print("Accuracy Score:", (AC)*100)   # accuracy expressed as a percentage
print("Classification Report:\n", CR)

In [None]:
# Random Forest: train one forest per candidate n_estimators value
# (10, 20, ..., 90) and record the fraction of misclassified test rows for each.
tree_counts = range(10, 100, 10)
error_rate = []
for i in tree_counts:
    RFC = RandomForestClassifier(random_state=0, n_estimators=i)
    pred = RFC.fit(X_train, y_train).predict(X_test)
    error_rate.append((pred != y_test).mean())

# Line chart of the recorded error rate against the candidate values, to see
# where the error is lowest
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(tree_counts, error_rate, marker='o', color='blue', linestyle='dashed')
ax.set_title("Range vs Error rate")
ax.set_xlabel("Range")
ax.set_ylabel("Error Rate")
plt.show()

In [None]:
#Applying KNN & checking the optimal number of neighbours which gives a minimum error rate
# One classifier is fitted per k in 1..39 (Minkowski metric with p=2), and the
# fraction of misclassified test rows is recorded for each k.
k_values = range(1, 40)
error_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors = k, metric = 'minkowski', p = 2)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    error_rate.append(np.mean(pred != y_test))


# The x-axis holds k and the y-axis holds the error rate; the labels now say so
# (they were previously swapped and referred to "epochs", which KNN does not have).
plt.figure(figsize = (10,6))
plt.plot(k_values, error_rate, linestyle = 'dashed', color = 'blue', marker = 'o')
plt.title("Number of neighbours (k) vs Error rate")
plt.xlabel("Number of neighbours (k)")
plt.ylabel("Error rate")
plt.show()