# ASSIGNMENT

1. [Load the dataset using pandas.](#1)
2. [Handle missing values appropriately.](#2)
3. [Perform feature engineering (e.g., creating new features from existing ones, encoding categorical variables).](#3)
4. [Implement custom Python functions for the following tasks.](#4)
    * [Calculate the mean and median age for each occupation.](#5)
    * [Create a new column that categorizes income into 'Low', 'Medium', and 'High' based on predefined thresholds.](#6)
5. [Scale the numerical features.](#7)
6. [Implement a sophisticated classification model (e.g., Random Forest, Gradient Boosting).](#8)
7. [Perform hyperparameter tuning using GridSearchCV or RandomizedSearchCV.](#9)
8. [Evaluate the model using cross-validation and appropriate metrics (e.g. accuracy, F1-score, ROC-AUC).](#10)

In [89]:
import numpy as np
import pandas as pd
import random
import joblib
from faker import Faker
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Load the dataset using pandas<a id=1></a>

In [2]:
df = pd.read_csv("generated_dataset.csv")

In [3]:
df.head()

Unnamed: 0,Name,Age,Gender,Occupation,Income,Purchased
0,John,28.0,Female,Lawyer,110644.0,No
1,Jennifer,25.0,Male,Engineer,76147.0,No
2,Emily,46.0,Male,,86747.0,Yes
3,James,43.0,Male,Doctor,93792.0,No
4,Deborah,31.0,Female,Nurse,67400.0,No


### Handle missing values appropriately <a id=2></a>

In [4]:
df.isnull().sum()

Name          500
Age           500
Gender        500
Occupation    500
Income        500
Purchased       0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4500 non-null   object 
 1   Age         4500 non-null   float64
 2   Gender      4500 non-null   object 
 3   Occupation  4500 non-null   object 
 4   Income      4500 non-null   float64
 5   Purchased   5000 non-null   object 
dtypes: float64(2), object(4)
memory usage: 234.5+ KB


In [6]:
data = df.drop(["Name"], axis =1)

In [7]:
data

Unnamed: 0,Age,Gender,Occupation,Income,Purchased
0,28.0,Female,Lawyer,110644.0,No
1,25.0,Male,Engineer,76147.0,No
2,46.0,Male,,86747.0,Yes
3,43.0,Male,Doctor,93792.0,No
4,31.0,Female,Nurse,67400.0,No
...,...,...,...,...,...
4995,39.0,Female,Teacher,40296.0,No
4996,42.0,Male,Artist,56282.0,Yes
4997,39.0,Male,,,Yes
4998,42.0,Female,Artist,47805.0,No


In [8]:
numeric_var = ['Age','Income']
categoric_var = ['Gender','Occupation']

In [9]:
def col_unique_value(data):
    for col in data.columns:
        print(col, ":", data[col].unique())

In [10]:
col_unique_value(data)

Age : [28. 25. 46. 43. 31. 53. 50. 42. 26. 27. 52. 45. 47. 23. 57. 38. 48. 36.
 24. 49. 40. nan 29. 58. 34. 35. 39. 59. 22. 56. 51. 41. 32. 44. 37. 33.
 30. 60. 55. 54.]
Gender : ['Female' 'Male' nan]
Occupation : ['Lawyer' 'Engineer' nan 'Doctor' 'Nurse' 'Artist' 'Teacher']
Income : [110644.  76147.  86747. ...  56282.  47805. 100365.]
Purchased : ['No' 'Yes']


In [11]:
for i in numeric_var:
    data[i]=data[i].fillna(data[i].median())

In [12]:
col_unique_value(data)

Age : [28. 25. 46. 43. 31. 53. 50. 42. 26. 27. 52. 45. 47. 23. 57. 38. 48. 36.
 24. 49. 40. 41. 29. 58. 34. 35. 39. 59. 22. 56. 51. 32. 44. 37. 33. 30.
 60. 55. 54.]
Gender : ['Female' 'Male' nan]
Occupation : ['Lawyer' 'Engineer' nan 'Doctor' 'Nurse' 'Artist' 'Teacher']
Income : [110644.  76147.  86747. ...  56282.  47805. 100365.]
Purchased : ['No' 'Yes']


In [13]:
for i in categoric_var:
    mode_value = data[i].mode()[0]
    data[i] = data[i].fillna(mode_value)

In [14]:
col_unique_value(data)

Age : [28. 25. 46. 43. 31. 53. 50. 42. 26. 27. 52. 45. 47. 23. 57. 38. 48. 36.
 24. 49. 40. 41. 29. 58. 34. 35. 39. 59. 22. 56. 51. 32. 44. 37. 33. 30.
 60. 55. 54.]
Gender : ['Female' 'Male']
Occupation : ['Lawyer' 'Engineer' 'Doctor' 'Nurse' 'Artist' 'Teacher']
Income : [110644.  76147.  86747. ...  56282.  47805. 100365.]
Purchased : ['No' 'Yes']


### Perform feature engineering <a id=3><a/>

In [15]:
bins = [0, 30, 40, 50, 60, np.inf]
labels = ['<30', '30-39', '40-49', '50-59', '60+']
data['Age_Group'] = pd.cut(data['Age'], bins=bins, labels=labels, right=False)
print(data)

       Age  Gender Occupation    Income Purchased Age_Group
0     28.0  Female     Lawyer  110644.0        No       <30
1     25.0    Male   Engineer   76147.0        No       <30
2     46.0    Male   Engineer   86747.0       Yes     40-49
3     43.0    Male     Doctor   93792.0        No     40-49
4     31.0  Female      Nurse   67400.0        No     30-39
...    ...     ...        ...       ...       ...       ...
4995  39.0  Female    Teacher   40296.0        No     30-39
4996  42.0    Male     Artist   56282.0       Yes     40-49
4997  39.0    Male   Engineer   71714.5       Yes     30-39
4998  42.0  Female     Artist   47805.0        No     40-49
4999  46.0    Male     Lawyer  100365.0       Yes     40-49

[5000 rows x 6 columns]


In [16]:
col_unique_value(data)

Age : [28. 25. 46. 43. 31. 53. 50. 42. 26. 27. 52. 45. 47. 23. 57. 38. 48. 36.
 24. 49. 40. 41. 29. 58. 34. 35. 39. 59. 22. 56. 51. 32. 44. 37. 33. 30.
 60. 55. 54.]
Gender : ['Female' 'Male']
Occupation : ['Lawyer' 'Engineer' 'Doctor' 'Nurse' 'Artist' 'Teacher']
Income : [110644.  76147.  86747. ...  56282.  47805. 100365.]
Purchased : ['No' 'Yes']
Age_Group : ['<30', '40-49', '30-39', '50-59', '60+']
Categories (5, object): ['<30' < '30-39' < '40-49' < '50-59' < '60+']


In [17]:
data

Unnamed: 0,Age,Gender,Occupation,Income,Purchased,Age_Group
0,28.0,Female,Lawyer,110644.0,No,<30
1,25.0,Male,Engineer,76147.0,No,<30
2,46.0,Male,Engineer,86747.0,Yes,40-49
3,43.0,Male,Doctor,93792.0,No,40-49
4,31.0,Female,Nurse,67400.0,No,30-39
...,...,...,...,...,...,...
4995,39.0,Female,Teacher,40296.0,No,30-39
4996,42.0,Male,Artist,56282.0,Yes,40-49
4997,39.0,Male,Engineer,71714.5,Yes,30-39
4998,42.0,Female,Artist,47805.0,No,40-49


In [18]:
data = pd.get_dummies(data, columns=['Gender', 'Occupation', 'Age_Group'], drop_first=False)
data["Purchased"].replace({"Yes":1,"No":0},inplace=True)

In [19]:
col_unique_value(data)

Age : [28. 25. 46. 43. 31. 53. 50. 42. 26. 27. 52. 45. 47. 23. 57. 38. 48. 36.
 24. 49. 40. 41. 29. 58. 34. 35. 39. 59. 22. 56. 51. 32. 44. 37. 33. 30.
 60. 55. 54.]
Income : [110644.  76147.  86747. ...  56282.  47805. 100365.]
Purchased : [0 1]
Gender_Female : [1 0]
Gender_Male : [0 1]
Occupation_Artist : [0 1]
Occupation_Doctor : [0 1]
Occupation_Engineer : [0 1]
Occupation_Lawyer : [1 0]
Occupation_Nurse : [0 1]
Occupation_Teacher : [0 1]
Age_Group_<30 : [1 0]
Age_Group_30-39 : [0 1]
Age_Group_40-49 : [0 1]
Age_Group_50-59 : [0 1]
Age_Group_60+ : [0 1]


In [20]:
data

Unnamed: 0,Age,Income,Purchased,Gender_Female,Gender_Male,Occupation_Artist,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Teacher,Age_Group_<30,Age_Group_30-39,Age_Group_40-49,Age_Group_50-59,Age_Group_60+
0,28.0,110644.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0
1,25.0,76147.0,0,0,1,0,0,1,0,0,0,1,0,0,0,0
2,46.0,86747.0,1,0,1,0,0,1,0,0,0,0,0,1,0,0
3,43.0,93792.0,0,0,1,0,1,0,0,0,0,0,0,1,0,0
4,31.0,67400.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,39.0,40296.0,0,1,0,0,0,0,0,0,1,0,1,0,0,0
4996,42.0,56282.0,1,0,1,1,0,0,0,0,0,0,0,1,0,0
4997,39.0,71714.5,1,0,1,0,0,1,0,0,0,0,1,0,0,0
4998,42.0,47805.0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


### Implement custom Python functions for the following tasks: <a id=4></a>

#### --> Calculate the mean and median age for each occupation <a id=5></a>

In [21]:
mean_age = data.groupby(df['Occupation'])['Age'].mean()
median_age = data.groupby(df['Occupation'])['Age'].median()

In [22]:
age_by_occ = pd.DataFrame({'Mean_Age': mean_age, 'Median_Age': median_age})
age_by_occ

Unnamed: 0_level_0,Mean_Age,Median_Age
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
Artist,40.984085,41.0
Doctor,41.803763,41.0
Engineer,41.212548,41.0
Lawyer,41.091755,41.0
Nurse,40.987288,41.0
Teacher,41.240473,41.0


#### --> Create a new column that categorizes income into 'Low', 'Medium', and 'High' based on predefined thresholds <a id=6></a>

In [23]:
data['Income'].describe()

count      5000.000000
mean      75757.648400
std       25153.815114
min       35070.000000
25%       56836.000000
50%       71714.500000
75%       90844.750000
max      149914.000000
Name: Income, dtype: float64

In [24]:
bins = [35000,70000,100000,150000]
labels = [ 'Low','Medium', 'High']
data['Income_Category'] = pd.cut(data['Income'], bins=bins, labels=labels, right=False)

In [27]:
data = pd.get_dummies(data, columns = ['Income_Category'], drop_first=False)

In [28]:
data.head()

Unnamed: 0,Age,Income,Purchased,Gender_Female,Gender_Male,Occupation_Artist,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Teacher,Age_Group_<30,Age_Group_30-39,Age_Group_40-49,Age_Group_50-59,Age_Group_60+,Income_Category_Low,Income_Category_Medium,Income_Category_High
0,28.0,110644.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1
1,25.0,76147.0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0
2,46.0,86747.0,1,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0
3,43.0,93792.0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0
4,31.0,67400.0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0


### Scale the numerical features <a id=7></a>

In [29]:
numeric_var = ['Age','Income']

In [30]:
scaler = MinMaxScaler()

In [31]:
data[numeric_var] = scaler.fit_transform(data[numeric_var])

In [32]:
col_unique_value(data)

Age : [0.15789474 0.07894737 0.63157895 0.55263158 0.23684211 0.81578947
 0.73684211 0.52631579 0.10526316 0.13157895 0.78947368 0.60526316
 0.65789474 0.02631579 0.92105263 0.42105263 0.68421053 0.36842105
 0.05263158 0.71052632 0.47368421 0.5        0.18421053 0.94736842
 0.31578947 0.34210526 0.44736842 0.97368421 0.         0.89473684
 0.76315789 0.26315789 0.57894737 0.39473684 0.28947368 0.21052632
 1.         0.86842105 0.84210526]
Income : [0.65805789 0.3576765  0.44997562 ... 0.18470273 0.11088955 0.56855386]
Purchased : [0 1]
Gender_Female : [1 0]
Gender_Male : [0 1]
Occupation_Artist : [0 1]
Occupation_Doctor : [0 1]
Occupation_Engineer : [0 1]
Occupation_Lawyer : [1 0]
Occupation_Nurse : [0 1]
Occupation_Teacher : [0 1]
Age_Group_<30 : [1 0]
Age_Group_30-39 : [0 1]
Age_Group_40-49 : [0 1]
Age_Group_50-59 : [0 1]
Age_Group_60+ : [0 1]
Income_Category_Low : [0 1]
Income_Category_Medium : [0 1]
Income_Category_High : [1 0]


### Implement a sophisticated classification model <a id=8></a>

In [35]:
X = data.drop(["Purchased"], axis =1)
y = data["Purchased"]

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,random_state=42)

In [37]:
X_train.shape, y_train.shape , X_test.shape , y_test.shape

((3750, 18), (3750,), (1250, 18), (1250,))

### Perform hyperparameter tuning using GridSearchCV or RandomizedSearchCV <a id=9></a>

In [49]:
rf_grid = { 'n_estimators': [100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 
                 'max_features': ['sqrt', 'log2']}

In [54]:
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [56]:
best_rf = grid_search_rf.best_estimator_

In [58]:
best_rf.fit(X_train, y_train)

In [60]:
y_pred_rf = best_rf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Best Random Forest Accuracy: {accuracy_rf}")

Best Random Forest Accuracy: 0.5104


### Evaluate the model <a id=10></a>

In [64]:
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.50      0.60      0.54       608
           1       0.53      0.42      0.47       642

    accuracy                           0.51      1250
   macro avg       0.51      0.51      0.51      1250
weighted avg       0.51      0.51      0.51      1250

[[366 242]
 [370 272]]


In [90]:
cv_scores = cross_val_score(best_rf, X_train, y_train)
print("Cross-Validation", cv_scores)
print("Mean", np.mean(cv_scores))

Cross-Validation [0.49333333 0.44533333 0.50933333 0.49333333 0.51333333]
Mean 0.49093333333333333


In [65]:
joblib.dump(voting_clf, 'model.pkl')

['model.pkl']

### Accuracy on train data

In [66]:
y_train_pre = voting_clf.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pre)
print(f" Accuracy on training data: {accuracy}")

 Accuracy on training data: 0.9685333333333334


In [74]:
import numpy as np
import pandas as pd
import random
import joblib
from faker import Faker
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def preprocess_input(input_data):
    df = pd.DataFrame([input_data])
    df = df.drop(["Name"], axis =1)

    age_bins = [0, 30, 40, 50, 60, np.inf]
    age_labels = ['<30', '30-39', '40-49', '50-59', '60+']
    df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

    income_bins = [35000, 70000, 100000, 150000]
    income_labels = ['Low', 'Medium', 'High']
    df['Income_Category'] = pd.cut(df['Income'], bins=income_bins, labels=income_labels, right=False)

    df = pd.get_dummies(df, columns=['Gender', 'Occupation', 'Age_Group', 'Income_Category'], drop_first=False)
    
    expected_columns = [
        'Age', 'Income', 'Gender_Female', 'Gender_Male', 
        'Occupation_Artist', 'Occupation_Doctor', 'Occupation_Engineer',
        'Occupation_Lawyer', 'Occupation_Nurse', 'Occupation_Teacher', 
        'Age_Group_<30', 'Age_Group_30-39', 'Age_Group_40-49', 
        'Age_Group_50-59', 'Age_Group_60+', 'Income_Category_Low',
        'Income_Category_Medium', 'Income_Category_High'
    ]
    for col in expected_columns:
        if col not in df.columns:
            df[col] = 0
    df = df[expected_columns]
    
    return df

input_data = {'Name': 'Disha', 'Age': 35, 'Gender': 'Female', 'Occupation': 'Engineer', 'Income': 85000}
inp = preprocess_input(input_data)
inp

Unnamed: 0,Age,Income,Gender_Female,Gender_Male,Occupation_Artist,Occupation_Doctor,Occupation_Engineer,Occupation_Lawyer,Occupation_Nurse,Occupation_Teacher,Age_Group_<30,Age_Group_30-39,Age_Group_40-49,Age_Group_50-59,Age_Group_60+,Income_Category_Low,Income_Category_Medium,Income_Category_High
0,35,85000,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0


In [75]:
pred = best_rf.predict(inp)

In [76]:
pred

array([0], dtype=int64)

In [77]:
pred[0]

0

In [87]:
pred = best_rf.predict_proba(inp)
prob_0 = pred[0][0]
prob_1 = pred[0][1]
print("Probability of No:", prob_0)
print("Probability of Yes:", prob_1)

Probability of No: 0.5058445220724027
Probability of Yes: 0.49415547792759723
