In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Read the Anscombe workbook (path is relative to the notebook's working
# directory) and show it; its x1..x4 / y1..y4 columns are used below.
anscombe_path = "data/Anscombe.xlsx"
anscombe = pd.read_excel(anscombe_path)
anscombe

In [None]:
# Per-column summary statistics of the Anscombe frame, for comparison across
# the x/y pairs before looking at the plots.
anscombe.describe()

In [None]:
# For each of the four x/y pairs, print the 2x2 correlation matrix returned by
# np.corrcoef (the off-diagonal entries are the x-y correlation).
for pair_no in range(1, 5):
    x_col = "x" + str(pair_no)
    y_col = "y" + str(pair_no)
    corr_matrix = np.corrcoef(anscombe[x_col], anscombe[y_col])
    print("Correlation bw. x{0} and y{0} \n".format(pair_no), corr_matrix, "\n")

In [None]:
# Feature matrix as a single-column 2-D array (n_samples, 1), which is the
# layout the estimator below is fitted on; the target stays 1-D.
X = anscombe[["x1"]].values
y = anscombe["y1"].values

In [None]:
# Fit a linear regression on the first Anscombe pair, then print the
# coefficient array followed by the intercept, one per line.
lr = LinearRegression().fit(X, y)
print(lr.coef_, lr.intercept_, sep="\n")

In [None]:
# Scatter of the first x/y pair. Title and axis labels are set so the figure
# can be told apart from the other three pairs when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(anscombe.x1, anscombe.y1)
ax.set(title="Anscombe pair 1", xlabel="x1", ylabel="y1");

In [None]:
# Scatter of the second x/y pair. Title and axis labels are set so the figure
# can be told apart from the other three pairs when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(anscombe.x2, anscombe.y2)
ax.set(title="Anscombe pair 2", xlabel="x2", ylabel="y2");

In [None]:
# Scatter of the third x/y pair. Title and axis labels are set so the figure
# can be told apart from the other three pairs when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(anscombe.x3, anscombe.y3)
ax.set(title="Anscombe pair 3", xlabel="x3", ylabel="y3");

In [None]:
# Scatter of the fourth x/y pair. Title and axis labels are set so the figure
# can be told apart from the other three pairs when the notebook is skimmed.
fig, ax = plt.subplots()
ax.scatter(anscombe.x4, anscombe.y4)
ax.set(title="Anscombe pair 4", xlabel="x4", ylabel="y4");

### Sales - Age Problem

In [None]:
### Reading the Input File
# Path is relative to the notebook's working directory; the frame is shown in
# full as the cell output.
sales_path = "data/Sales.xlsx"
df = pd.read_excel(sales_path)
df

In [None]:
# 2x2 correlation matrix of age and sales; the off-diagonal entries hold the
# age-sales correlation coefficient.
np.corrcoef(df["age"], df["sales"])

In [None]:
# Columns are picked by position: column 0 is the single feature (kept 2-D),
# column 1 is the target.
# NOTE(review): presumably column 0 is `age` and column 1 is `sales` — confirm
# against the workbook's column order.
X = df.iloc[:, [0]].values
y = df.iloc[:, 1].values

In [None]:
### Model building
# Hold out a test split (seeded so the split is repeatable), fit on the
# training part, and score the predictions on the held-out part only.
split_parts = train_test_split(X, y, random_state=50)
X_train, X_test, y_train, y_test = split_parts
lr = LinearRegression().fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print("R2-Score", test_r2)
print("Mean Square Error", test_mse)

In [None]:
### Building the model using an Ordinal Variable
# Number of rows in each age category.
df["age_cat"].value_counts()

In [None]:
# Peek at the first rows to check the column positions used by the
# position-based (iloc) selections in the following cell.
df.head()

In [None]:
# Column 2 is the single feature (kept 2-D); column 1 is the target.
# NOTE(review): presumably column 2 is `age_cat`, used here as a plain number —
# confirm against the workbook's column order.
X = df.iloc[:, [2]].values
y = df.iloc[:, 1].values

In [None]:
### Model building
# Same seeded split / fit / held-out scoring as for the previous model, now on
# the X and y defined in the cell above.
split_parts = train_test_split(X, y, random_state=50)
X_train, X_test, y_train, y_test = split_parts
lr = LinearRegression().fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print("R2-Score", test_r2)
print("Mean Square Error", test_mse)

### Will Visualization help in this case???
![](img/ques.jpg)

In [None]:
### Visualize the data
# Sales against raw age. Axis labels and a title make the figure readable on
# its own; the trailing `;` keeps the artist repr out of the cell output,
# matching the earlier plot cells.
fig, ax = plt.subplots()
ax.scatter(df.age, df.sales)
ax.set(title="Sales vs. age", xlabel="age", ylabel="sales");

In [None]:
### Visualize the data
# Sales against the age category. Axis labels and a title make the figure
# readable on its own; the trailing `;` keeps the artist repr out of the cell
# output, matching the earlier plot cells.
fig, ax = plt.subplots()
ax.scatter(df.age_cat, df.sales)
ax.set(title="Sales vs. age category", xlabel="age_cat", ylabel="sales");

In [None]:
### Visualize the data
# Distribution of sales within each age category. `column` is passed by
# keyword for clarity, and the trailing `;` suppresses the returned Axes repr
# so only the figure is shown (consistent with the other plot cells).
df.boxplot(column=['sales'], by='age_cat');

### So what's next???
![](img/ques.jpg)

#### Model building using a Nominal Variable (Dummy Variables)

In [None]:
# One-hot encode age_cat: casting it to 'category' makes pd.get_dummies replace
# it with one indicator column per level.
# The guard makes the cell safe to re-run: after the first run `df` no longer
# has an `age_cat` column, and the unguarded `df.age_cat` would raise
# AttributeError. The cast is done on a copy (DataFrame.astype returns a new
# frame) so the original frame is not modified before `df` is rebound.
if 'age_cat' in df.columns:
    df = pd.get_dummies(df.astype({'age_cat': 'category'}))

In [None]:
# Show the frame after the get_dummies step to check the positions of the
# indicator columns selected by iloc in the next cell.
df

In [None]:
# Features are the columns at positions 2, 3 and 4; the target is column 1.
# NOTE(review): presumably positions 2-4 are all of the age_cat indicator
# columns. If so, using every level together with the model's intercept is the
# "dummy-variable trap" (coefficients not uniquely determined) — confirm.
feature_positions = [2, 3, 4]
X = df.iloc[:, feature_positions].values
y = df.iloc[:, 1].values

In [None]:
### Model building
# Same seeded split / fit / held-out scoring as for the earlier models, now on
# the X and y defined in the cell above.
split_parts = train_test_split(X, y, random_state=50)
X_train, X_test, y_train, y_test = split_parts
lr = LinearRegression().fit(X_train, y_train)
y_test_pred = lr.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
print("R2-Score", test_r2)
print("Mean Square Error", test_mse)