In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('penguins_size.csv')
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# Handling missing values

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

#setting strategy to 'most frequent' to impute by the mean
imputer = SimpleImputer(strategy='most_frequent')# strategy can also be mean or median 
df.iloc[:,:] = imputer.fit_transform(df)

In [4]:
lb = LabelEncoder()
df["sex"] = lb.fit_transform(df["sex"])
df['sex'][:5]

0    2
1    1
2    1
3    2
4    1
Name: sex, dtype: int64

In [5]:
lb = LabelEncoder()
df["island"] = lb.fit_transform(df["island"])
df['island'][:5]

0    2
1    2
2    2
3    2
4    2
Name: island, dtype: int64

In [6]:
lb = LabelEncoder()
df["species"] = lb.fit_transform(df["species"])
df['species'][:5]

0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64

In [7]:
df['species'].value_counts()

0    152
2    124
1     68
Name: species, dtype: int64

# Linear regression

In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [71]:
X = df.loc[:, ['body_mass_g', 'flipper_length_mm', 'culmen_length_mm', 'island']]
y = df.iloc[:, 0]

print(X)
print(y)

print(X.shape)
print(y.shape)

     body_mass_g  flipper_length_mm  culmen_length_mm  island
0         3750.0              181.0              39.1       2
1         3800.0              186.0              39.5       2
2         3250.0              195.0              40.3       2
3         3800.0              190.0              41.1       2
4         3450.0              193.0              36.7       2
..           ...                ...               ...     ...
339       3800.0              190.0              41.1       0
340       4850.0              215.0              46.8       0
341       5750.0              222.0              50.4       0
342       5200.0              212.0              45.2       0
343       5400.0              213.0              49.9       0

[344 rows x 4 columns]
0      0
1      0
2      0
3      0
4      0
      ..
339    2
340    2
341    2
342    2
343    2
Name: species, Length: 344, dtype: int64
(344, 4)
(344,)


In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)

pipe.score(X_test, y_test)

0.9565217391304348

# Feature selection

In [70]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X_train, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns
print(featureScores.nlargest(10,'Score'))  #print 10 best features

               Specs         Score
0        body_mass_g  29596.401773
1  flipper_length_mm    212.431851
2   culmen_length_mm    139.690508
3             island    101.750702
