In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import xgboost as xgb
import joblib
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Read the column names and unique values from the 'age.names' file
column_names_file = r'Dataset\adult.names'
with open(column_names_file, 'r') as f:
    lines = f.readlines()

# Extract the attribute names from the lines
attribute_names = []
for line in lines[1:]:
    line = line.strip()
    if ':' in line:
        attribute_name = line.split(':')[0].strip()
        attribute_names.append(attribute_name)

In [3]:
data = pd.read_csv(r'Dataset\adult.data', header=None, names=attribute_names)
# Select the relevant features and the target variable
features = ['age', 'workclass', 'education', 'occupation', 'race', 'sex', 'hours-per-week', 'native-country']
target = '>50k'


X = data[features]
y = data[target]

In [4]:
print(X.head())

   age          workclass   education          occupation    race      sex  \
0   39          State-gov   Bachelors        Adm-clerical   White     Male   
1   50   Self-emp-not-inc   Bachelors     Exec-managerial   White     Male   
2   38            Private     HS-grad   Handlers-cleaners   White     Male   
3   53            Private        11th   Handlers-cleaners   Black     Male   
4   28            Private   Bachelors      Prof-specialty   Black   Female   

   hours-per-week  native-country  
0              40   United-States  
1              13   United-States  
2              40   United-States  
3              40   United-States  
4              40            Cuba  


In [5]:
label_encoder = LabelEncoder()
for col in range(X.shape[1]):
    if X.iloc[:, col].dtype == object:
        X.iloc[:, col] = label_encoder.fit_transform(X.iloc[:, col])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [6]:
scaler = StandardScaler()
numerical_features = ['age', 'hours-per-week']
X.loc[:, numerical_features] = scaler.fit_transform(X.loc[:, numerical_features].values)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())


In [7]:
print(X.head())

        age  workclass  education  occupation  race  sex  hours-per-week  \
0  0.030671          7          9           1     4    1       -0.035429   
1  0.837109          6          9           4     4    1       -2.222153   
2 -0.042642          4         11           6     4    1       -0.035429   
3  1.057047          4          1           6     2    1       -0.035429   
4 -0.775768          4          9          10     2    0       -0.035429   

   native-country  
0              39  
1              39  
2              39  
3              39  
4               5  


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Train the models
model = xgb.XGBClassifier()
model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)

# Random Forest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# Gradient Boosting
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

# Support Vector Machines
svm = SVC()
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

# Neural Networks
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
mlp_pred = mlp.predict(X_test)
mlp_accuracy = accuracy_score(y_test, mlp_pred)


# Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)

# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)








In [10]:
# print the accuracies of the model
print('XGBoost accuracy: ', accuracy)
print('Random Forest accuracy: ', rf_accuracy)
print('Gradient Boosting accuracy: ', gb_accuracy)
print('Support Vector Machines accuracy: ', svm_accuracy)
print('Neural Networks accuracy: ', mlp_accuracy)
print('Logistic Regression accuracy: ', lr_accuracy)
print('Naive Bayes accuracy: ', nb_accuracy)


XGBoost accuracy:  0.8168278826961461
Random Forest accuracy:  0.7919545524335944
Gradient Boosting accuracy:  0.8215875940426839
Support Vector Machines accuracy:  0.7587901120835252
Neural Networks accuracy:  0.804544756640565
Logistic Regression accuracy:  0.7595578074619991
Naive Bayes accuracy:  0.7623215108245048


In [11]:
# Save the models
joblib.dump(rf, "models/random_forest_model.joblib")
joblib.dump(gb, "models/gradient_boosting_model.joblib")
joblib.dump(svm, "models/svm_model.joblib")
joblib.dump(mlp, "models/neural_network_model.joblib")
joblib.dump(lr, "models/logistic_regression_model.joblib")
joblib.dump(nb, "models/naive_bayes_model.joblib")

['models/naive_bayes_model.joblib']