In [4]:
import numpy as np
from matplotlib import pyplot as plt 
import pandas as pd

In [5]:
# Read the training data and the test data from CSV files in the working directory.
consumer_analysis, consumer_test_data = (
    pd.read_csv(csv_name)
    for csv_name in ("Consumer_Dataset.csv", "Consumer Test Dataset.csv")
)

***
The following cell drops rows that are duplicates or contain NA values, so that these rows do not interfere with the training of the model.

In [6]:
# Remove incomplete rows and exact duplicate rows, then renumber the index last
# so it is contiguous (resetting before de-duplication would leave gaps
# wherever a duplicate is removed).
consumer_analysis = (
    consumer_analysis
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
consumer_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6665 entries, 0 to 6664
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           6665 non-null   int64  
 1   Gender               6665 non-null   object 
 2   Age                  6665 non-null   int64  
 3   Ever_Married         6665 non-null   object 
 4   Family_Size          6665 non-null   float64
 5   Profession           6665 non-null   object 
 6   Graduated            6665 non-null   object 
 7   Work_Experience      6665 non-null   float64
 8   Energy_Consumption   6665 non-null   object 
 9   Preferred_Renewable  6665 non-null   object 
 10  Group                6665 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 572.9+ KB
None


***
The following cell creates the feature vector template. Non-numeric values are converted to 0 or 1 by adding a separate row (an indicator feature) for each distinct non-numeric value.

In [7]:
# Fixed leading features; the category values found in the data are appended below.
feature = ["Male", "Female", "Age", "Ever_Married", "Family_Size"]


def _append_unseen_values(names, column):
    """Append each value of consumer_analysis[column] not already in names, in order of appearance."""
    for value in consumer_analysis[column]:
        if value not in names:
            names.append(value)


_append_unseen_values(feature, "Profession")
feature += ["Graduated", "Work_Experience"]
for category_column in ("Energy_Consumption", "Preferred_Renewable"):
    _append_unseen_values(feature, category_column)

# One row per feature name, with a "Values" column initialised to 0.
feature_vector = pd.DataFrame({"Features": feature})
feature_vector["Values"] = 0



***
In the following cell I define the functions that build the feature vector and the one-hot group label for each row of the dataset.

In [8]:

def phix(i, data=None):
    """Build the numeric feature vector for one row.

    Parameters
    ----------
    i : index label
        Row label, looked up with ``data.loc[i]``.
    data : pandas.DataFrame, optional
        Frame to read the row from. Defaults to the global
        ``consumer_analysis`` so existing ``phix(i)`` calls behave as before;
        pass another frame (e.g. ``consumer_test_data``) to encode its rows.

    Returns
    -------
    numpy.ndarray
        Float vector with one entry per row of ``feature_vector``, in the
        order of ``feature_vector["Features"]``.

    Notes
    -----
    Positions are looked up by feature name instead of fixed index ranges, so
    the encoding does not depend on how many distinct professions, energy
    levels or renewable types the data contains. A category value that has no
    entry in ``feature_vector`` leaves all indicators of that column at 0.
    """
    if data is None:
        data = consumer_analysis
    row = data.loc[i]

    position = {name: pos for pos, name in enumerate(feature_vector["Features"])}
    phi = np.zeros(len(feature_vector))

    # Any gender other than "Male" is encoded as "Female".
    phi[position["Male" if row["Gender"] == "Male" else "Female"]] = 1
    phi[position["Age"]] = row["Age"]
    if row["Ever_Married"] == "Yes":
        phi[position["Ever_Married"]] = 1
    phi[position["Family_Size"]] = row["Family_Size"]
    if row["Graduated"] == "Yes":
        phi[position["Graduated"]] = 1
    phi[position["Work_Experience"]] = row["Work_Experience"]

    # One-hot indicators: switch on the entry named after the row's category value.
    for column in ("Profession", "Energy_Consumption", "Preferred_Renewable"):
        pos = position.get(row[column])
        if pos is not None:
            phi[pos] = 1
    return phi
def one_hot_encode(group):
    """Return a length-4 float vector with a 1 at the position of ``group``.

    ``group`` must be one of "A", "B", "C" or "D" (mapped to positions 0-3);
    any other value raises ``KeyError``.
    """
    class_position = dict(zip("ABCD", range(4)))
    return np.eye(4)[class_position[group]]


    
    
    
    

***
Initialization of the weight matrix (one weight vector per class)

In [9]:

# One row of weights per class and one column per feature, all starting at zero.
num_classes = 4
size = feature_vector.shape[0]
weight = np.zeros(shape=(num_classes, size))

***
Stochastic gradient descent on the first 5000 rows; the last 1000 rows are held out for validation.

In [10]:
# Encode the training rows once. The feature vectors and one-hot targets do not
# change between epochs, so rebuilding them on every step (and calling phix
# twice per step) only repeats the same work.
train_index = consumer_analysis.head(5000).index
train_features = [phix(i) for i in train_index]
train_targets = [one_hot_encode(consumer_analysis.loc[i, "Group"]) for i in train_index]

# 10 passes over the training rows in file order; the step size 0.1/k decays
# with the epoch number k.
for k in range(1, 11):
    for x_i, y_i in zip(train_features, train_targets):
        score = np.dot(weight, x_i)
        # Subtract the max score before exponentiating so np.exp cannot overflow.
        exp_score = np.exp(score - np.max(score))
        probabilities = exp_score / np.sum(exp_score)
        # Gradient of the softmax cross-entropy loss with respect to the weights.
        error = probabilities - y_i
        grad = np.outer(error, x_i)
        weight = weight - grad * 0.1 / k

        

***
Building the list of predicted groups for the validation rows

In [11]:
# Predict a group for each of the last 1000 rows of consumer_analysis.
group_labels = ["A", "B", "C", "D"]
predictions = []
for row_label in consumer_analysis.tail(1000).index:
    class_scores = np.dot(weight, phix(row_label))
    # Softmax with the max score subtracted first.
    shifted_exp = np.exp(class_scores - np.max(class_scores))
    class_probabilities = shifted_exp / np.sum(shifted_exp)
    best_class = np.argmax(class_probabilities)
    predictions.append(group_labels[best_class])

***
The following cells compute the accuracy and precision of the model on the validation rows.

In [12]:
# Convert group letters to their positions in group_labels for the metric functions.
label_to_class_id = group_labels.index
predicted_numerical_labels = list(map(label_to_class_id, predictions))
true_numerical_labels = list(map(label_to_class_id, consumer_analysis.tail(1000)["Group"]))


In [13]:
from sklearn.metrics import accuracy_score, precision_score
# Accuracy and class-frequency-weighted precision of the validation predictions.
accuracy = accuracy_score(y_true=true_numerical_labels, y_pred=predicted_numerical_labels)
precision = precision_score(
    y_true=true_numerical_labels,
    y_pred=predicted_numerical_labels,
    average="weighted",
)
print(accuracy, precision, sep="\n")


0.409
0.39512777203301824


***
Assigning Group label to test data

In [22]:
# Encode the rows of consumer_test_data themselves. phix(i) reads the global
# consumer_analysis frame, so calling it with test index labels would score
# training rows and write those labels onto the test data.
feature_position = {name: pos for pos, name in enumerate(feature_vector["Features"])}


def encode_test_row(row):
    """Encode one consumer_test_data row in the column order of feature_vector.

    Mirrors the training encoding: gender indicators (anything other than
    "Male" counts as "Female"), raw Age / Family_Size / Work_Experience,
    "Yes" indicators for Ever_Married and Graduated, and one-hot indicators
    for Profession, Energy_Consumption and Preferred_Renewable. Category
    values that have no entry in feature_vector are left at 0.
    """
    phi = np.zeros(len(feature_vector))
    phi[feature_position["Male" if row["Gender"] == "Male" else "Female"]] = 1
    phi[feature_position["Age"]] = row["Age"]
    if row["Ever_Married"] == "Yes":
        phi[feature_position["Ever_Married"]] = 1
    phi[feature_position["Family_Size"]] = row["Family_Size"]
    if row["Graduated"] == "Yes":
        phi[feature_position["Graduated"]] = 1
    phi[feature_position["Work_Experience"]] = row["Work_Experience"]
    for column in ("Profession", "Energy_Consumption", "Preferred_Renewable"):
        pos = feature_position.get(row[column])
        if pos is not None:
            phi[pos] = 1
    return phi


# NOTE(review): the test frame is not passed through dropna(). If a row has a
# missing numeric field its scores are all NaN and np.argmax returns index 0
# ("A") - confirm how such rows should be handled.
test_groups = []
for _, test_row in consumer_test_data.iterrows():
    score = np.dot(weight, encode_test_row(test_row))
    exp_score = np.exp(score - np.max(score))
    probabilities = exp_score / np.sum(exp_score)
    test_groups.append(group_labels[np.argmax(probabilities)])

# Assign all labels at once instead of writing cell by cell with .loc.
consumer_test_data["Group"] = test_groups