# Crab Ages notebook

#### Download the datasets (`train.csv` and `test.csv`) and save them in the same folder as this notebook
#### Import the necessary libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
# Read the labelled training data from the notebook's folder and preview its first rows.
train = pd.read_csv(filepath_or_buffer='train.csv')
train.head(n=5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


In [3]:
# Read the test data (same columns as the training data, minus `Age`) and preview it.
test = pd.read_csv(filepath_or_buffer='test.csv')
test.head(n=5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,74051,I,1.05,0.7625,0.275,8.618248,3.657085,1.729319,2.721552
1,74052,I,1.1625,0.8875,0.275,15.507176,7.030676,3.246018,3.96893
2,74053,F,1.2875,0.9875,0.325,14.571643,5.556502,3.883882,4.819415
3,74054,F,1.55,0.9875,0.3875,28.377849,13.380964,6.548735,7.030676
4,74055,I,1.1125,0.85,0.2625,11.765042,5.528153,2.466407,3.331066


In [4]:
# Keep the test ids aside before the `id` column is dropped; they are needed
# again when the submission file is assembled.
test_id = test.loc[:, 'id']

In [5]:
# Count how many training rows carry each `Sex` label, most frequent first.
train['Sex'].value_counts(normalize=False, sort=True)

M    27084
I    23957
F    23010
Name: Sex, dtype: int64

#### Convert the `Sex` column in both the train and test datasets from string labels (object dtype) to integer codes

In [6]:
# Integer code assigned to each label of the `Sex` column. Defined once so the
# train and test sets cannot drift apart.
# NOTE(review): 0/1/2 imposes an ordering on what looks like a nominal category;
# consider one-hot encoding for the linear model.
SEX_CODES = {'M':0,'F':1,'I':2}


def _encode_sex(column):
    """Return `column` with its labels replaced by the codes in SEX_CODES.

    A column that is already numeric is returned unchanged, so re-running this
    cell is harmless.

    Raises:
        ValueError: if the column holds a value (including a missing value)
            that has no entry in SEX_CODES.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    encoded = column.map(SEX_CODES)
    unmapped = encoded.isna()
    if unmapped.any():
        # Fail loudly instead of letting stray strings reach the models.
        unknown = sorted(set(column[unmapped].astype(str)))
        raise ValueError(f'Unexpected Sex labels: {unknown}')
    # Explicit cast: do not rely on pandas inferring an integer dtype.
    return encoded.astype('int64')


train['Sex'] = _encode_sex(train['Sex'])
test['Sex'] = _encode_sex(test['Sex'])

In [7]:
# Preview the training data again to confirm `Sex` now holds integer codes.
train.head(n=5)

Unnamed: 0,id,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age
0,0,2,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,2,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,0,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,1,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,2,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


#### Drop the `id` column from both the test and train datasets, as it is a row identifier rather than a feature and is not needed for model fitting

In [8]:
# Remove the `id` column from both frames. `errors='ignore'` keeps the cell
# re-runnable: once `id` is gone, a second run is a no-op instead of a KeyError.
test = test.drop(columns='id', errors='ignore')
train = train.drop(columns='id', errors='ignore')

#### Split the training data into features (`X_train`) and target (`y_train`); the test data is used as-is for `X_test`

In [9]:
# Target: the `Age` column of the training data.
y_train = train['Age']
# Features: every remaining training column.
X_train = train.drop(columns='Age')
# The test frame has no `Age` column, so it is used unchanged as the feature matrix.
X_test = test

## Decision Tree Model

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed so repeated runs build the same tree; scikit-learn permutes
# candidate features randomly at each split, so an unseeded fit can differ
# between runs.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train,y_train)

In [11]:
# Predict ages for the unlabelled test set with the fitted decision tree.
predictions = model.predict(X_test)

# `score` on a regressor returns R^2, not classification accuracy. Here it is
# measured on the same rows the tree was fitted to, so it is an optimistic
# figure and not an estimate of performance on new data.
Accuracy = model.score(X_train,y_train)*100

# 5-fold cross-validated R^2 (as a percentage) estimates performance on rows
# the model has not seen. cross_val_score fits clones of the estimator, so the
# already-fitted `model` is left untouched.
Cv_Accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()*100
print(f'Decision tree cross-validated R^2: {Cv_Accuracy:.2f}%')

Accuracy

100.0

## Random Forest model

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so the bootstrap samples and feature draws, and therefore the
# fitted forest and its scores, are the same on every run.
model2 = RandomForestRegressor(random_state=42)
model2.fit(X_train,y_train)

In [13]:
# Keep the test-set predictions; a bare `predict` call that is not the last
# expression of the cell is computed and then thrown away.
rand_predictions = model2.predict(X_test)

# R^2 (as a percentage) on the training rows themselves, which is optimistic.
Rand_Accuracy = model2.score(X_train,y_train)*100

# 5-fold cross-validated R^2 estimates performance on unseen rows. The forest
# is refitted once per fold on clones, so the folds are run in parallel.
Rand_Cv_Accuracy = cross_val_score(
    model2, X_train, y_train, cv=5, scoring='r2', n_jobs=-1
).mean()*100
print(f'Random forest cross-validated R^2: {Rand_Cv_Accuracy:.2f}%')

Rand_Accuracy

93.78870671885463

## Linear Regression model

In [14]:
from sklearn.linear_model import LinearRegression
model3 = LinearRegression()
model3.fit(X_train,y_train)

In [15]:
# Keep the test-set predictions; a bare `predict` call that is not the last
# expression of the cell is computed and then thrown away.
lr_predictions = model3.predict(X_test)

# R^2 (as a percentage) on the training rows themselves.
Lr_Accuracy = model3.score(X_train,y_train)*100

# 5-fold cross-validated R^2 estimates performance on unseen rows.
# cross_val_score fits clones, so the fitted `model3` is left untouched.
Lr_Cv_Accuracy = cross_val_score(model3, X_train, y_train, cv=5, scoring='r2').mean()*100
print(f'Linear regression cross-validated R^2: {Lr_Cv_Accuracy:.2f}%')

Lr_Accuracy

54.5206595764406

## Making a sample submission using the decision tree model (the highest score on the training data)

In [16]:
# NOTE(review): `model` is the decision tree. Its perfect score was measured on
# the training data, so confirm this choice against a held-out score.
Predictions = pd.DataFrame(model.predict(X_test), columns=['Predictions'])

# concat(axis=1) aligns rows by index label, not by position. Resetting the id
# index to 0..n-1 matches the fresh index of `Predictions`, so each id stays
# paired with its own prediction even if `test` had a non-default index.
Id = test_id.reset_index(drop=True).to_frame(name='id')
assert len(Id) == len(Predictions), 'ids and predictions must have one row each per test sample'

Model_Predictions = pd.concat([Id,Predictions],axis=1)

In [17]:
# Write the id/prediction pairs next to the notebook, without the row index.
Model_Predictions.to_csv(path_or_buf='Crab_Ages.csv', index=False)