<a href="https://colab.research.google.com/github/Desmyk/MACHINELEARNING/blob/main/ADULTdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [1]:
# necessary lib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the dataset
url = "https://archive.ics.uci.edu/static/public/2/data.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Preprocess the data
le = LabelEncoder()
df['workclass'] = le.fit_transform(df['workclass'])
df['education'] = le.fit_transform(df['education'])
df['marital-status'] = le.fit_transform(df['marital-status'])
df['occupation'] = le.fit_transform(df['occupation'])
df['relationship'] = le.fit_transform(df['relationship'])
df['race'] = le.fit_transform(df['race'])
df['sex'] = le.fit_transform(df['sex'])
df['native-country'] = le.fit_transform(df['native-country'])
df['income'] = le.fit_transform(df['income'])


In [4]:
# Drop missing values
df.dropna(inplace=True)

In [5]:
# Define the features and target
X = df.drop('income', axis=1)
y = df['income']

In [6]:
# KNeighbors Classifier
knn_clf = KNeighborsClassifier(n_neighbors=63)
knn_scores = cross_val_score(knn_clf, X, y, cv=5)
print("KNeighbors Classifier Cross-Validation Scores:", knn_scores)
print("KNeighbors Classifier Mean Accuracy:", np.mean(knn_scores))

KNeighbors Classifier Cross-Validation Scores: [0.53147712 0.52973692 0.5295864  0.53112203 0.52682228]
KNeighbors Classifier Mean Accuracy: 0.5297489513913918


In [7]:
# Train the KNeighbors Classifier on the entire dataset
knn_clf.fit(X, y)
y_pred_knn = knn_clf.predict(X)
print("KNeighbors Classifier Classification Report:")
print(classification_report(y, y_pred_knn))
print("KNeighbors Classifier Confusion Matrix:")
print(confusion_matrix(y, y_pred_knn))

KNeighbors Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.99      0.69     24720
           1       0.40      0.01      0.02     12435
           2       0.63      0.19      0.29      7841
           3       0.50      0.00      0.00      3846

    accuracy                           0.53     48842
   macro avg       0.52      0.30      0.25     48842
weighted avg       0.51      0.53      0.40     48842

KNeighbors Classifier Confusion Matrix:
[[24472   111   137     0]
 [12236   120    78     1]
 [ 6323    47  1468     3]
 [ 3162    19   661     4]]


In [8]:
# Support Vector Classifier
svm_clf = SVC(kernel='rbf', C=10, gamma=0.05)
svm_scores = cross_val_score(svm_clf, X, y, cv=5)
print("Support Vector Classifier Cross-Validation Scores:", svm_scores)
print("Support Vector Classifier Mean Accuracy:", np.mean(svm_scores))

Support Vector Classifier Cross-Validation Scores: [0.50076773 0.50455523 0.504095   0.50102375 0.50255938]
Support Vector Classifier Mean Accuracy: 0.5026002186126048


In [12]:
# Train the Support Vector Classifier on the entire dataset
svm_clf.fit(X, y)
y_pred_svm = svm_clf.predict(X)
print("Support Vector Classifier Classification Report:")
print(classification_report(y, y_pred_svm))
print("Support Vector Classifier Confusion Matrix:")
print(confusion_matrix(y, y_pred_svm))

Support Vector Classifier Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24720
           1       1.00      1.00      1.00     12435
           2       1.00      1.00      1.00      7841
           3       1.00      0.99      1.00      3846

    accuracy                           1.00     48842
   macro avg       1.00      1.00      1.00     48842
weighted avg       1.00      1.00      1.00     48842

Support Vector Classifier Confusion Matrix:
[[24701    15     2     2]
 [   47 12385     3     0]
 [   15     3  7821     2]
 [   11     3    10  3822]]
