# Multi class Classification

<p>import data and verify multi class of target variable

In [1]:
import pandas as pd
cars = pd.read_csv("auto.csv")
unique_regions = cars['origin'].unique()
print(unique_regions)

[1 3 2]


<p>creating dummies for year,cylinders category columns

In [2]:
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)
dummy_years = pd.get_dummies(cars["year"], prefix="year")
cars = pd.concat([cars, dummy_years], axis=1)
cars = cars.drop(['year','cylinders'],axis=1)
print(cars.head())

    mpg  displacement  horsepower  weight  acceleration  origin  cyl_3  cyl_4  \
0  18.0         307.0       130.0  3504.0          12.0       1      0      0   
1  15.0         350.0       165.0  3693.0          11.5       1      0      0   
2  18.0         318.0       150.0  3436.0          11.0       1      0      0   
3  16.0         304.0       150.0  3433.0          12.0       1      0      0   
4  17.0         302.0       140.0  3449.0          10.5       1      0      0   

   cyl_5  cyl_6  ...  year_73  year_74  year_75  year_76  year_77  year_78  \
0      0      0  ...        0        0        0        0        0        0   
1      0      0  ...        0        0        0        0        0        0   
2      0      0  ...        0        0        0        0        0        0   
3      0      0  ...        0        0        0        0        0        0   
4      0      0  ...        0        0        0        0        0        0   

   year_79  year_80  year_81  year_82  
0   

<p>shuffling and splitting into train,test

In [4]:
import numpy as np

In [5]:
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]
seventy_percent = int(shuffled_cars.shape[0] * 0.7)
train = shuffled_cars.iloc[0:seventy_percent]
test =  shuffled_cars.iloc[seventy_percent:]

# one-vs-all approach for multiclass classification problem

<p> As unique_regions are 3 , we make 3 binary classification models

In [6]:
from sklearn.linear_model import LogisticRegression

unique_origins = cars["origin"].unique()
unique_origins.sort()  
models = {}
features = [c for c in train.columns if c.startswith('year') or c.startswith('cyl')]
for origin in unique_origins:
    model = LogisticRegression()
    X_train=train[features]
    y_train=train['origin'] == origin
    model.fit(X_train,y_train)
    models[origin] = model

In [7]:
testing_probs = pd.DataFrame(columns=unique_origins)
  

for origin in unique_origins:
    # Select testing features.
    X_test = test[features]   
    # Compute probability of observation being in the origin.
    testing_probs[origin] = models[origin].predict_proba(X_test)[:,1]

In [8]:
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)

0      1
1      1
2      2
3      1
4      1
      ..
113    1
114    1
115    1
116    1
117    1
Length: 118, dtype: int64


In [9]:
cars['predicted_origins'] = predicted_origins

In [15]:
matches = cars['origin'] == cars['predicted_origins']
print(cars[matches].shape[0])

67


In [16]:
print(len(cars))

392


In [12]:
accuracy = len(cars[matches]) / len(cars)

In [13]:
print(accuracy)

0.17091836734693877
