### create a notebook named model.ipynb.



### 1)run all your previous scripts that acquired, prepared, split, and scaled your data.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.linear_model import LinearRegression
from math import sqrt
import warnings
warnings.filterwarnings("ignore")
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

#import our scripts that do data science workflow
import wrangle
import split_scale_telco
import evaluate
import features

In [2]:
df=wrangle.wrangle_telco()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1685 entries, 0 to 1694
Data columns (total 4 columns):
customer_id        1685 non-null object
total_charges      1685 non-null float64
monthly_charges    1685 non-null float64
tenure             1685 non-null int64
dtypes: float64(2), int64(1), object(1)
memory usage: 65.8+ KB


In [3]:
train,test=split_scale_telco.split_my_data(df)

In [4]:
scaler, train_scaled, test_scaled=split_scale_telco.standard_scaler(train.drop(columns=['customer_id','total_charges']),test.drop(columns=['customer_id','total_charges']))

In [5]:
print(train_scaled)
print(test_scaled)

      monthly_charges    tenure
120         -0.349191  0.839851
1423        -1.184789 -1.700684
389          0.811120  0.839851
1137        -1.180459 -1.926509
1504        -1.036142  0.839851
438          0.646599  0.839851
571          1.102642 -0.232820
656          1.511060  0.726938
753          1.225311  0.839851
574          1.466321  0.839851
1215        -1.028926  0.839851
1053        -1.044801  0.501113
1131        -1.204993  0.726938
1253        -1.047687  0.839851
1031        -1.204993 -2.999179
772          1.490855 -0.006994
1402        -1.204993 -0.740926
516          0.461872  0.726938
1102        -1.047687  0.218831
1694        -1.199221 -0.628014
1086        -1.189119 -0.176363
1517        -1.040472  0.331744
133          0.336316  0.614025
1036        -1.193448  0.162375
72          -0.632053 -0.515101
551          0.513827  0.783394
1024        -1.053460 -0.289276
1432        -1.181903 -0.176363
895          0.669689  0.839851
1460        -1.106858 -0.853839
117     

In [6]:
X_train=train_scaled
y_train=train[['total_charges']]
X_test=test_scaled
y_test=test[['total_charges']]

### 2)select your features using your features script

In [7]:
features.optimal_number_of_features(X_train,y_train,X_test,y_test)

1

In [8]:
features.optimal_features(X_train,y_train,1)

Index(['monthly_charges'], dtype='object')

In [10]:
y_train

Unnamed: 0,total_charges
120,3580.95
1423,538.20
389,6376.55
1137,478.75
1504,1797.10
438,6096.90
571,5424.25
656,7939.25
753,7346.20
574,7965.95


### 3)fit 2 different linear models to your data

In [17]:
predictions=pd.DataFrame({'actual':y_train.total_charges}).reset_index(drop=True)
#model 1
lm1=LinearRegression()
lm1.fit(X_train[['monthly_charges']],y_train)
lm1_predictions=lm1.predict(X_train[['monthly_charges']])
predictions['lm1']=lm1_predictions

#model 2
lm2=LinearRegression()
lm2.fit(X_train[['tenure']],y_train)
lm2_predictions=lm2.predict(X_train[['tenure']])
predictions['lm2']=lm2_predictions

#baseline model
predictions['baseline'] = y_train.mean()[0]
#y_pred_baseline

actual      0
lm1         0
lm2         0
baseline    0
dtype: int64

### 4)evaluate the 2 models and your baseline.

In [19]:
MSE_baseline = mean_squared_error(predictions.actual, predictions.baseline)
SSE_baseline = MSE_baseline*len(predictions.actual)
RMSE_baseline = sqrt(MSE_baseline)
r2_baseline = r2_score(predictions.actual, predictions.baseline)
print(MSE_baseline,SSE_baseline,RMSE_baseline,r2_baseline)

6612927.089508685 8914225716.657707 2571.5612163642313 0.0


In [20]:
MSE_1 = mean_squared_error(predictions.actual, predictions.lm1)
SSE_1 = MSE_1*len(predictions.actual)
RMSE_1 = sqrt(MSE_1)
r2_1 = r2_score(predictions.actual, predictions.lm1)
print(MSE_1,SSE_1,RMSE_1,r2_1)

710410.3104002099 957633098.4194828 842.8584165802758 0.8925724870719858


In [21]:
MSE_2 = mean_squared_error(predictions.actual, predictions.lm2)
SSE_2 = MSE_2*len(predictions.actual)
RMSE_2 = sqrt(MSE_2)
r2_2 = r2_score(predictions.actual, predictions.lm2)
print(MSE_2,SSE_2,RMSE_2,r2_2)

3930986.41713009 5298969690.291361 1982.6715353608347 0.40556029668517835


### 5)select the one that performed the best.
The model with monthly charges performed the best

### 6)apply to your test data

In [None]:
model=lm1.predict(X_test[['monthly_charges']])
model=model.ravel().reshape(337)
y_test=np.array(y_test).ravel().reshape(337)
best_model=pd.DataFrame({'predictions':model,'total_charges':y_test})

best_model.head()

### 7)Write a function that creates the model object, fits and predicts,   given X_train, X_test, y_train, y_test

In [30]:
X_train1=X_train[['monthly_charges']]
X_test1=X_test[['monthly_charges']]
def modeling_function(X_train,X_test,y_train,y_test):
    predictions_train=pd.DataFrame({'actual':y_train.total_charges}).reset_index(drop=True)
    predictions_test=pd.DataFrame({'actual':y_test.total_charges}).reset_index(drop=True)
    #model 1
    lm1=LinearRegression()
    lm1.fit(X_train1,y_train)
    lm1_predictions=lm1.predict(X_train1)
    predictions_train['lm1']=lm1_predictions

    #model 2
    lm2=LinearRegression()
    lm2.fit(X_test1,y_test)
    lm2_predictions=lm2.predict(X_test1)
    predictions_test['lm2']=lm2_predictions
    
    return predictions_train,predictions_test

In [31]:
model_test,model_train=modeling_function(X_train1,X_test1,y_train,y_test)

In [33]:
model_test.head(20)
#model_train.head(20)

Unnamed: 0,actual,lm1
0,3580.95,2911.298265
1,538.2,881.205753
2,6376.55,5730.286831
3,478.75,891.724367
4,1797.1,1242.344836
5,6096.9,5330.579497
6,5424.25,6438.540177
7,7939.25,7430.796103
8,7346.2,6736.567576
9,7965.95,7322.103758


### 8)Write a function, plot_residuals(x, y, dataframe) that takes the feature, the target, and the dataframe as input and returns a residual plot.

In [None]:
def plot_residuals(x, y):
    '''
    Plots the residuals of a model that uses x to predict y. Note that we don't
    need to make any predictions ourselves here, seaborn will create the model
    and predictions for us under the hood with the `residplot` function.
    '''
    return sns.residplot(x, y)

x=test[['monthly_charges']]
y=test[['total_charges']]
plot_residuals(x,y)

### 9)Write a function, plot_regression(x, y) that takes a feature and a target and returns the datapoints, the regression line, and the confidence interval.

In [None]:
res = sm.OLS(y, x).fit()

In [None]:
res.summary()

In [None]:
prstd, iv_l, iv_u = wls_prediction_std(res)

fig, ax = plt.subplots(figsize=(8,6))

ax.plot(x, y, 'o', label="data")
#ax.plot(x, y, 'b-', label="True")
ax.plot(x, res.fittedvalues, 'r--.', label="OLS")
ax.plot(x, iv_u, 'r--')
ax.plot(x, iv_l, 'r--')
ax.legend(loc='best');

In [None]:
def plot_regression(x,y):
    res = sm.OLS(y, x).fit()
    prstd, iv_l, iv_u = wls_prediction_std(res)

    fig, ax = plt.subplots(figsize=(8,6))

    ax.plot(x, y, 'o', label="data")
    #ax.plot(x, y, 'b-', label="True")
    ax.plot(x, res.fittedvalues, 'r--.', label="OLS")
    ax.plot(x, iv_u, 'r--')
    ax.plot(x, iv_l, 'r--')
    ax.legend(loc='best');
    plt.show()
    

In [None]:
plot_regression(x,y)