# Abalone linear regression


# The age of an abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope — a boring and time-consuming task. We want to predict the age from other physical measurements, which are easier to obtain. The age of an abalone in years is (number of rings + 1.5).

In [1]:
import pandas as pd
from collections import Counter 
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import r2_score

In [2]:
# Load the abalone measurements from a CSV file in the notebook's working directory.
df = pd.read_csv("abalone.csv")

In [3]:
# Preview the first 10 rows of the raw data.
df.head(10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [4]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output reports a minimum Height of 0.0, which looks like a
# data-quality issue rather than a real measurement — worth checking before modelling.
df.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [5]:
# Column dtypes and non-null counts; 'Sex' is the only non-numeric (object) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole weight    4177 non-null   float64
 5   Shucked weight  4177 non-null   float64
 6   Viscera weight  4177 non-null   float64
 7   Shell weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [6]:
# Separate the target (ring count) from the predictor columns.
# drop() returns a new frame, so later edits to X leave df untouched.
y = df['Rings']
X = df.drop(columns='Rings')

In [7]:
# Check that the feature frame no longer contains the 'Rings' target column.
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055


In [8]:
# Frequency of each ring count. The saved output shows a heavily imbalanced target:
# most samples fall between 7 and 11 rings, while several counts occur only once.
Counter(y)

Counter({15: 103,
         7: 391,
         9: 689,
         10: 634,
         8: 568,
         20: 26,
         16: 67,
         19: 32,
         14: 126,
         11: 487,
         12: 267,
         18: 42,
         13: 203,
         5: 115,
         4: 57,
         6: 259,
         21: 14,
         17: 58,
         22: 6,
         1: 1,
         3: 15,
         26: 1,
         23: 9,
         29: 1,
         2: 1,
         27: 2,
         25: 1,
         24: 2})

In [9]:
# List the distinct labels in the categorical 'Sex' column before encoding it.
set(X['Sex'])

{'F', 'I', 'M'}

In [10]:
# Replace the 'Sex' strings with integer codes so the estimators below receive numeric input.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for the target (y), not for
# features. The integer codes also impose an ordering on a nominal category, which the linear
# model below will treat as a numeric quantity. Consider one-hot encoding instead
# (e.g. pd.get_dummies); that changes X's columns and the downstream results, so re-run everything.
enc=LabelEncoder()
X['Sex']=enc.fit_transform(X['Sex'])
# Display the distinct codes produced by the encoder.
set(X['Sex'])

{0, 1, 2}

In [11]:
# Re-display the original frame: the encoding was applied to X only, so df still holds
# the raw 'Sex' strings.
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [12]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# Number of training rows.
len(X_train)

3132

In [13]:
# Number of held-out test rows.
len(X_test)

1045

In [14]:
# Peek at the training features; the original row index is preserved, so it is no longer sequential.
X_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
940,1,0.46,0.345,0.105,0.449,0.196,0.0945,0.1265
2688,2,0.63,0.465,0.15,1.027,0.537,0.188,0.176
1948,2,0.635,0.515,0.165,1.229,0.5055,0.2975,0.3535
713,2,0.355,0.265,0.085,0.201,0.069,0.053,0.0695
3743,0,0.705,0.555,0.195,1.7525,0.7105,0.4215,0.516


In [15]:
# Baseline: Gaussian Naive Bayes fitted as a classifier, so every distinct ring count
# is treated as its own class rather than as a point on a numeric scale.
clf=GaussianNB()
clf.fit(X_train,y_train)

In [16]:
# Predict ring-count classes for the held-out rows.
y_pred = clf.predict(X_test)
# Accuracy as a percentage; only exact ring-count matches count as correct.
100 * accuracy_score(y_test, y_pred)

26.02870813397129

In [17]:
# Per-class precision / recall / F1 for the Naive Bayes predictions.
# Some ring counts are never predicted, so their precision is undefined. zero_division=0
# reports those scores as 0.0 explicitly instead of emitting an UndefinedMetricWarning each time.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           3       0.50      1.00      0.67         7
           4       0.30      0.62      0.40        13
           5       0.27      0.42      0.33        40
           6       0.32      0.43      0.36        63
           7       0.26      0.36      0.30       114
           8       0.27      0.29      0.28       139
           9       0.25      0.30      0.27       152
          10       0.21      0.24      0.23       139
          11       0.26      0.42      0.32       121
          12       0.50      0.01      0.02        93
          13       0.00      0.00      0.00        51
          14       0.00      0.00      0.00        32
          15       0.00      0.00      0.00        22
          16       0.00      0.00      0.00        16
          17       0.00      0.00      0.00        12
          18       0.00      0.00      0.00         6
          19       0.00      0.00      0.00        10
          20       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# Fit ordinary least-squares regression with the ring count as a continuous target.
reg = LinearRegression()
reg.fit(X_train,y_train)

In [19]:
# Predict (continuous) ring counts for the held-out rows.
# NOTE(review): this rebinds y_pred, which earlier held the GaussianNB class predictions.
# Re-running the classification metric cells after this one would score the regression
# output instead; a distinct name would avoid that hidden-state trap.
y_pred=reg.predict(X_test)
y_pred

array([13.10451425,  9.66747548, 10.35605247, ...,  9.95962005,
       12.59111443, 12.18516586])

In [20]:
# Mean absolute error of the regression predictions, in units of rings. Age is rings plus
# a constant 1.5, so the same figure applies to the error in years.
mean_absolute_error(y_test,y_pred)

1.5955158378194014

In [21]:
# Coefficient of determination (R^2) of the regression predictions on the test set.
r2_score(y_test,y_pred)

0.535415850189408