In [4]:
#importing the libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
# Load the star dataset from CSV into a DataFrame.
star_df = pd.read_csv('star_type_.csv')
# A bare last expression renders the frame as the cell output.
star_df


Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type
0,3068,0.002400,0.1700,16.12,Brown Dwarf
1,3042,0.000500,0.1542,16.60,Brown Dwarf
2,2600,0.000300,0.1020,18.70,Brown Dwarf
3,2800,0.000200,0.1600,16.65,Brown Dwarf
4,1939,0.000138,0.1030,20.06,Brown Dwarf
...,...,...,...,...,...
235,38940,374830.000000,1356.0000,-9.93,Supergiant
236,30839,834042.000000,1194.0000,-10.63,Supergiant
237,8829,537493.000000,1423.0000,-10.73,Supergiant
238,9235,404940.000000,1112.0000,-11.23,Supergiant


In [6]:
# Target count: how many rows belong to each star type.
# Printed explicitly because a notebook only auto-displays the LAST expression
# of a cell; a bare value_counts() placed above info() is silently discarded.
print(star_df['Star type'].value_counts())
# Fetch generic info: column names, non-null counts, dtypes and memory usage.
star_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    object 
dtypes: float64(3), int64(1), object(1)
memory usage: 9.5+ KB


**Training**

In [7]:
# Features: every column except the last one.
X = star_df.iloc[:, :-1]
# Target: the star-type label column.
y = star_df['Star type']

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs. Note: the split is not stratified by class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**Create a pipeline that scales (i.e., preprocesses) the features so the model trains faster**

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Create pipeline with standard scaler and logistic regression model.
# The scaler is fitted inside the pipeline, so the same scaling learned on the
# training data is re-applied automatically at prediction time.
# `multi_class` is intentionally not passed: the parameter is deprecated since
# scikit-learn 1.5 (it raises a FutureWarning), and with the 'newton-cg' solver
# the default already fits a multinomial model for a multi-class target.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(solver='newton-cg')),
])

In [10]:
# Fit the scaler and then the logistic regression classifier on the training split.
pipeline.fit(X_train, y_train)



In [11]:
# True labels of the held-out set as a plain array, for element-wise comparison.
actual = y_test.values
# Labels predicted by the fitted pipeline for the same rows.
preds = pipeline.predict(X_test)
print(f'This is the actual output - \n {actual}')
print(f'This is the predicted output - \n {preds}')

This is the actual output - 
 ['White Dwarf' 'Brown Dwarf' 'Main Sequence' 'Hypergiant' 'Hypergiant'
 'Supergiant' 'Supergiant' 'White Dwarf' 'Brown Dwarf' 'White Dwarf'
 'Hypergiant' 'White Dwarf' 'Supergiant' 'Hypergiant' 'Supergiant'
 'Supergiant' 'Brown Dwarf' 'Red Dwarf' 'Main Sequence' 'Brown Dwarf'
 'Brown Dwarf' 'Red Dwarf' 'Supergiant' 'Main Sequence' 'Supergiant'
 'Main Sequence' 'Red Dwarf' 'White Dwarf' 'Supergiant' 'Main Sequence'
 'Main Sequence' 'Hypergiant' 'White Dwarf' 'Brown Dwarf' 'Red Dwarf'
 'Brown Dwarf' 'Red Dwarf' 'Supergiant' 'Red Dwarf' 'Supergiant'
 'Hypergiant' 'Supergiant' 'Hypergiant' 'Red Dwarf' 'Main Sequence'
 'Brown Dwarf' 'Hypergiant' 'Main Sequence']
This is the predicted output - 
 ['White Dwarf' 'Brown Dwarf' 'Red Dwarf' 'Hypergiant' 'Hypergiant'
 'Supergiant' 'Supergiant' 'White Dwarf' 'Brown Dwarf' 'White Dwarf'
 'Hypergiant' 'White Dwarf' 'Supergiant' 'Hypergiant' 'Supergiant'
 'Supergiant' 'Brown Dwarf' 'Red Dwarf' 'Main Sequence' 'Brown Dwarf

In [12]:
# Check at which indices the prediction did not match the true label.
incorrect_indexes = np.flatnonzero(actual != preds)
# Report each misclassified sample as (true label, predicted label).
for true_label, predicted_label in zip(actual[incorrect_indexes], preds[incorrect_indexes]):
    print(f'Actual output was {true_label} and predicted was {predicted_label}')

Actual output was Main Sequence and predicted was Red Dwarf
Actual output was Main Sequence and predicted was Hypergiant


In [13]:
# Accuracy: fraction of test samples whose predicted label equals the true label.
from sklearn.metrics import accuracy_score

print(accuracy_score(y_true=actual, y_pred=preds))

0.9583333333333334


**Save the Pipeline to Disk**

In [14]:
# Serialize the fitted pipeline (scaler + classifier) to 'pipeline.pkl' so it
# can be reloaded later without retraining.
from pickle import dump

with open('pipeline.pkl', mode='wb') as file:
    dump(pipeline, file)

In [15]:
# Load the pipeline back from disk to verify the saved artifact works.
# Unpickling can execute arbitrary code, so only load pickle files you trust
# (here: the file written by the previous cell).
from pickle import load

with open('pipeline.pkl', mode='rb') as file:
    pipeline_test = load(file)

In [16]:
# Peek at one row of the test set (position 1) to see typical feature values.
X_test.iloc[1]

Temperature (K)           2637.00000
Luminosity(L/Lo)             0.00073
Radius(R/Ro)                 0.12700
Absolute magnitude(Mv)      17.22000
Name: 6, dtype: float64

In [17]:
# Feature names, in the same column order as the data the pipeline was fitted on.
features = list(X_test.columns)
# Predict a single hand-written sample (not a row drawn from the dataset),
# wrapped in a one-row DataFrame with the training column names.
test_data = pd.DataFrame(
    data=[[40000, 0.00073, 0.127, 17.22]],
    columns=features,
)
output = pipeline_test.predict(test_data)
print(output)

['White Dwarf']
