# ACCURACY FOR DIFFERENT CASES OF TRAIN-TEST SPLIT

## Linear Regression Data

### Importing Needed Libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import linear_model

### Reading the Data

In [2]:
# Load the CSV and keep only the two columns this notebook works with:
# 'Salnty' (used later as the predictor) and 'T_degC' (used later as the target).
df = pd.read_csv("bottle1.csv")
selected_columns = ['Salnty', 'T_degC']
cdf = df[selected_columns]
# Preview the first rows of the reduced frame.
cdf.head()

Unnamed: 0,Salnty,T_degC
0,33.44,10.5
1,33.44,10.46
2,33.437,10.46
3,33.42,10.45
4,33.421,10.45


In [3]:
# Keep only the first 500 rows to speed up the regression.
# .iloc makes the positional row slice explicit (the original `cdf[:][:500]`
# carried a redundant `[:]`), and .copy() detaches the subset from the frame it
# was sliced from, so later in-place clean-up does not operate on a slice and
# trigger SettingWithCopyWarning.
cdf = cdf.iloc[:500].copy()
len(cdf)

500

## Creating Different Cases of Train and Test Datasets

In [4]:
# Fill missing values by forward-filling: each NaN takes the last valid value
# above it in the same column.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling. Reassigning (instead of inplace=True) keeps the cell
# idempotent and avoids mutating a slice in place.
cdf = cdf.ffill()

In [5]:
# Feature matrix: the 'Salnty' column turned into a 2-D column vector
# (one row per sample, a single feature column).
X = np.asarray(cdf['Salnty'])[:, np.newaxis]
# Preview the first five rows.
X[:5]

array([[33.44 ],
       [33.44 ],
       [33.437],
       [33.42 ],
       [33.421]])

In [6]:
# Target: the 'T_degC' column turned into a 2-D column vector
# (one row per sample, a single target column).
y = np.asarray(cdf['T_degC'])[:, np.newaxis]
# Preview the first five rows.
y[:5]

array([[10.5 ],
       [10.46],
       [10.46],
       [10.45],
       [10.45]])

In [7]:
# Drop any rows that still contain NaN (forward-fill cannot fill a gap at the
# very top of a column, because there is no earlier value to copy).
cdf = cdf.dropna()

# X and y were extracted from `cdf` before this clean-up step, so dropping rows
# here would otherwise have no effect on the arrays used for modelling.
# Rebuild both from the cleaned frame so the model never sees NaN rows.
X = np.asarray(cdf['Salnty']).reshape(-1, 1)
y = np.asarray(cdf['T_degC']).reshape(-1, 1)

In [8]:
# Repeat the whole experiment for every training fraction from 0.55 up to
# (but not including) 0.95 in steps of 0.05: split, fit, predict, score.
for train_frac in np.arange(0.55, 0.95, 0.05):

    # TRAIN/TEST SPLIT:
    # The test share is the complement of the training share; random_state is
    # fixed so the same rows land in each split on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=(1 - train_frac), random_state=4
    )
    print("\nTRAIN/TEST SIZES FOR i= %f:" % train_frac)
    print('\nTrain size for train= %f : ' % train_frac, X_train.shape, y_train.shape)
    print('Test size for train= %f: ' % train_frac, X_test.shape, y_test.shape)
    print()

    # MODELING:
    print("\nMODELING THE DATA FOR i=%f: " % train_frac)
    # fit() returns the estimator itself, so construction and fitting chain.
    regr = linear_model.LinearRegression().fit(X_train, y_train)

    # Slope and intercept of the fitted line.
    print('\nCoefficients for train= %f: ' % train_frac, regr.coef_)
    print('Intercept for train= %f: ' % train_frac, regr.intercept_)
    print()

    # PREDICTING:
    print("\nPREDICTING Y VALUES FOR i= %f: " % train_frac)
    y_pred = regr.predict(X_test)
    # Only the first three predictions are shown to keep the output short.
    print("\n", y_pred[:3])
    print()

    # ACCURACY:
    # For a regressor, score() is the coefficient of determination (R^2) on the
    # held-out test data, not a classification accuracy.
    print("\n ACCURACY FOR i= %f is : " % train_frac)
    acc = regr.score(X_test, y_test)
    print(acc)
    print()


TRAIN/TEST SIZES FOR i= 0.550000:

Train size for train= 0.550000 :  (275, 1) (275, 1)
Test size for train= 0.550000:  (225, 1) (225, 1)


MODELING THE DATA FOR i=0.550000: 

Coefficients for train= 0.550000:  [[-4.82572734]]
Intercept for train= 0.550000:  [170.12353733]


PREDICTING Y VALUES FOR i= 0.550000: 

 [[12.41876785]
 [11.55013693]
 [12.08096694]]


 ACCURACY FOR i= 0.550000 is : 
0.8268929289009727


TRAIN/TEST SIZES FOR i= 0.600000:

Train size for train= 0.600000 :  (300, 1) (300, 1)
Test size for train= 0.600000:  (200, 1) (200, 1)


MODELING THE DATA FOR i=0.600000: 

Coefficients for train= 0.600000:  [[-4.8294804]]
Intercept for train= 0.600000:  [170.27509432]


PREDICTING Y VALUES FOR i= 0.600000: 

 [[12.44767469]
 [11.57836822]
 [12.10961106]]


 ACCURACY FOR i= 0.600000 is : 
0.8294942066299138


TRAIN/TEST SIZES FOR i= 0.650000:

Train size for train= 0.650000 :  (325, 1) (325, 1)
Test size for train= 0.650000:  (175, 1) (175, 1)


MODELING THE DATA FOR i=0.650

In [9]:
 #APPENDING TRAIN SIZE INTO A LIST:

# Grid of training fractions: 0.55 up to (but not including) 0.95, step 0.05.
# .tolist() converts the NumPy scalars to plain Python floats, so the list
# prints as bare numbers on every NumPy version (NumPy 2 would otherwise show
# each element as np.float64(...)). The values themselves are unchanged,
# including the small floating-point drift that arange accumulates.
train_size = np.arange(0.55, 0.95, 0.05).tolist()

print("Train Size list: " ,train_size)

Train Size list:  [0.55, 0.6000000000000001, 0.6500000000000001, 0.7000000000000002, 0.7500000000000002, 0.8000000000000003, 0.8500000000000003, 0.9000000000000004]


In [10]:
# Collect the test-set score of the fitted model for every training fraction.
# The split parameters (fractions, random_state) match the experiment loop, so
# the scores line up with the values printed there.
accuracy = []

for train_frac in np.arange(0.55, 0.95, 0.05):
    # Test share is the complement of the training share.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=(1 - train_frac), random_state=4
    )
    # fit() returns the estimator itself, so construction and fitting chain.
    regr = linear_model.LinearRegression().fit(X_train, y_train)
    acc = regr.score(X_test, y_test)
    accuracy.append(acc)

print("Accuracy list: ",accuracy)

Accuracy list:  [0.8268929289009727, 0.8294942066299138, 0.8291089177079705, 0.8214741410038493, 0.8208642308186584, 0.8093114854419466, 0.8026764925718183, 0.8353087066779801]


## DataFrame for train size and Accuracy

In [11]:
# Combine the training fractions and their scores into one table, one row per
# split. The mapping is passed straight to DataFrame instead of being stored in
# a variable called `dict`, which would shadow the builtin `dict` type for the
# rest of the kernel session.
table = pd.DataFrame({'Train_Size': train_size, 'Accuracy': accuracy})
table

Unnamed: 0,Train_Size,Accuracy
0,0.55,0.826893
1,0.6,0.829494
2,0.65,0.829109
3,0.7,0.821474
4,0.75,0.820864
5,0.8,0.809311
6,0.85,0.802676
7,0.9,0.835309


In [12]:
# Order the rows from best to worst score. The original index labels are kept,
# so each row can still be traced back to its position in the unsorted table.
print("Sorting the Accuracy in descending order: ")
table = table.sort_values(by='Accuracy', ascending=False)
table

Sorting the Accuracy in descending order: 


Unnamed: 0,Train_Size,Accuracy
7,0.9,0.835309
1,0.6,0.829494
2,0.65,0.829109
0,0.55,0.826893
3,0.7,0.821474
4,0.75,0.820864
5,0.8,0.809311
6,0.85,0.802676


In [13]:
# Show the row with the highest score.
# The row label is looked up with idxmax() instead of being hard-coded as 7, so
# the cell stays correct if the data, the random seed, or the grid of training
# fractions changes. The label is wrapped in a list so .loc returns a one-row
# DataFrame rather than a Series.
print("The Maximum Accuracy is: ")
best_row_label = table['Accuracy'].idxmax()
table.loc[[best_row_label], ['Train_Size', 'Accuracy']]

The Maximum Accuracy is: 


Unnamed: 0,Train_Size,Accuracy
7,0.9,0.835309


# THE END