In [1]:
import pandas as pd
cars = pd.read_csv("auto.csv")

unique_regions = cars['origin'].unique()
unique_regions

array([1, 3, 2])

In [2]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
cars = pd.concat([cars, dummy_cylinders], axis=1)

dummy_years = pd.get_dummies(cars['year'], prefix='year')
dummy_years.head()
cars = pd.concat([cars, dummy_years], axis=1)

cars = cars.drop(['year','cylinders'], axis=1)
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
import numpy as np
shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]

cut_size = int(len(shuffled_cars)*0.7)
train = shuffled_cars[0:cut_size]
test = shuffled_cars[cut_size:]
print(train.shape[0])
print(test.shape[0])

274
118


In [7]:
from sklearn.linear_model import LogisticRegression

unique_origins = cars["origin"].unique()
unique_origins.sort()

models = {}

cols = []
for c in train.columns:
    if c.startswith('year_'):
        cols.append(c)
    elif c.startswith('cyl_'):
        cols.append(c)
        
for origin in unique_origins:
    lr = LogisticRegression()
    values = train['origin'] == origin
    lr.fit(train[cols], values)
    models[origin] = lr

models

{1: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 2: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 3: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False)}

In [8]:
testing_probs = pd.DataFrame(columns=unique_origins)

for origin in unique_origins:
    lr = models[origin]    
    testing_probs[origin] = lr.predict_proba(test[cols])[:,1]

testing_probs.head()

Unnamed: 0,1,2,3
0,0.388504,0.151869,0.468165
1,0.970607,0.020508,0.031601
2,0.95528,0.037236,0.024758
3,0.959533,0.021612,0.043226
4,0.874252,0.070962,0.062408


In [9]:
predicted_origins = testing_probs.idxmax(axis=1)
predicted_origins.head()

0    3
1    1
2    1
3    1
4    1
dtype: int64

In [14]:
lr = LogisticRegression()

lr.fit(train[cols], train['origin'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
lr.predict(test[cols])

array([3, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 3, 1, 1, 2, 1, 1, 3, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 3, 1, 1, 3, 2, 2, 1, 1, 1, 1,
       3, 1, 1, 2, 3, 1, 2, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1,
       1, 3, 1, 1, 3, 3, 2, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 3, 2, 1,
       2, 2, 1, 3, 2, 3, 3, 2, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 3, 1, 3, 2, 1,
       2, 1, 1])