# Training the AI model

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Load the data

In [52]:
# Read the cleaned dataset from disk into a DataFrame
DATA_PATH = '../data/clean_data.csv'
data = pd.read_csv(DATA_PATH)

In [53]:
# Preview the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068.0,0.0024,0.17,16.12,0.0,10,M
1,3042.0,0.0005,0.1542,16.6,0.0,10,M
2,2600.0,0.0003,0.102,18.7,0.0,10,M
3,2800.0,0.0002,0.16,16.65,0.0,10,M
4,10467.934498,0.000138,0.103,20.06,0.0,10,M


In [54]:
# Convert the spectral-class labels into integer codes.
# `le` is kept around because it holds the label <-> code mapping.
le = LabelEncoder()
spectral_codes = le.fit_transform(data['Spectral Class'])
data['Spectral Class'] = spectral_codes

In [55]:
# Preview the first five rows to confirm the target column is now numeric
data.head(n=5)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068.0,0.0024,0.17,16.12,0.0,10,4
1,3042.0,0.0005,0.1542,16.6,0.0,10,4
2,2600.0,0.0003,0.102,18.7,0.0,10,4
3,2800.0,0.0002,0.16,16.65,0.0,10,4
4,10467.934498,0.000138,0.103,20.06,0.0,10,4


### Normalize the data

In [56]:
# normalize the data except the target variable (the last column)
scaler = StandardScaler()
feature_cols = data.columns[:-1]
# Replace whole columns via label-based assignment instead of writing through
# `.iloc`: integer feature columns become float after scaling, and in-place
# positional assignment of an incompatible dtype is deprecated in recent pandas.
data[feature_cols] = scaler.fit_transform(data[feature_cols])
# NOTE(review): the scaler is fit on every row before the train/test split
# performed later in the notebook, so test-set statistics leak into the
# training features -- consider fitting the scaler on the training rows only.

data.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,-1.004128,-0.631904,-0.487763,1.465091,-2.280854,0.469459,4
1,-1.007181,-0.631904,-0.487798,1.520221,-2.280854,0.469459,4
2,-1.05908,-0.631904,-0.487913,1.761414,-2.280854,0.469459,4
3,-1.035596,-0.631904,-0.487785,1.525964,-2.280854,0.469459,4
4,-0.135226,-0.631904,-0.487911,1.917615,-2.280854,0.469459,4


In [57]:
# Summary statistics of every column after standardisation
# (quartiles listed explicitly; these are the pandas defaults)
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
count,696.0,696.0,696.0,696.0,696.0,696.0,696.0
mean,-2.0417890000000002e-17,2.0417890000000002e-17,-5.104474e-17,-4.083579e-17,-2.654326e-16,5.1044740000000004e-18,2.5
std,1.000719,1.000719,1.000719,1.000719,1.000719,1.000719,1.709053
min,-1.05908,-0.6319038,-0.4881187,-1.75541,-2.280854,-1.422512,0.0
25%,-0.7949489,-0.6319038,-0.4878821,-0.9940547,-0.68358,-1.044118,1.0
50%,-0.304099,-0.631515,-0.4844446,0.1004074,0.1150572,0.2802618,2.5
75%,0.3109748,0.5233951,-0.2989793,0.9843792,0.910485,0.6586561,4.0
max,3.332434,4.324329,3.796782,1.917615,1.712332,1.983036,5.0


### Split the data into training and test sets

In [58]:
# Separate the feature matrix from the encoded label column,
# then hold out 20% of the rows as a test set (fixed seed for repeatability)
target_col = 'Spectral Class'
X = data.drop(columns=[target_col])
y = data[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Create a Logistic Regression model

In [59]:
# Multinomial logistic regression fitted with the Newton-CG solver.
# NOTE(review): newer scikit-learn releases reportedly deprecate the
# `multi_class` argument -- confirm against the installed version.
LR_model = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=1000,
    verbose=1,
)

### Train the model

In [60]:
# Fit the classifier on the training split only
LR_model.fit(X=X_train, y=y_train)

### Evaluate the model

In [61]:
# Mean accuracy on the data the model was fitted on vs. the held-out rows
train_accuracy = LR_model.score(X_train, y_train)
test_accuracy = LR_model.score(X_test, y_test)
print('Train score:', train_accuracy)
print('Test score:', test_accuracy)

Train score: 0.8255395683453237
Test score: 0.8214285714285714


### Save the model

In [62]:
# save the model
import joblib
# Also persist the fitted preprocessing objects next to the model: predictions
# on new data need the same `scaler`, and `le` is required to map predicted
# integer codes back to spectral-class labels.
joblib.dump(scaler, '../models/StandardScaler.pkl')
joblib.dump(le, '../models/LabelEncoder.pkl')
# Kept as the last expression so the cell still displays the model path.
joblib.dump(LR_model, '../models/LogisticRegression_model.pkl')

['../models/LogisticRegression_model.pkl']