Question 1:

In [7]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'mckayla-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to read from s3 bucket
file_key = 'weather.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

##Reading the csv file
weather = pd.read_csv(file_content_stream)
weather.head()

Unnamed: 0,rowID,hpwren_timestamp,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,min_wind_direction,min_wind_speed,rain_accumulation,rain_duration,relative_humidity
0,0,2011-09-10 00:00:49,912.3,64.76,97.0,1.2,106.0,1.6,85.0,1.0,,,60.5
1,1,2011-09-10 00:01:49,912.3,63.86,161.0,0.8,215.0,1.5,43.0,0.2,0.0,0.0,39.9
2,2,2011-09-10 00:02:49,912.3,64.22,77.0,0.7,143.0,1.2,324.0,0.3,0.0,0.0,43.0
3,3,2011-09-10 00:03:49,912.3,64.4,89.0,1.2,112.0,1.6,12.0,0.7,0.0,0.0,49.5
4,4,2011-09-10 00:04:49,912.3,64.4,185.0,0.4,260.0,1.0,100.0,0.1,0.0,0.0,58.8


In [8]:
## Removing the observations with N/As
weather = weather.dropna()

In [9]:
## Remove variable
weather = weather.drop(columns = 'rowID', axis = 1)
weather = weather.drop(columns = 'hpwren_timestamp', axis = 1)
weather = weather.drop(columns = 'min_wind_speed', axis = 1)
weather = weather.drop(columns = 'min_wind_direction', axis = 1)
weather = weather.drop(columns = 'rain_accumulation', axis = 1)
weather = weather.drop(columns = 'rain_duration', axis = 1)
weather.head()

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity
1,912.3,63.86,161.0,0.8,215.0,1.5,39.9
2,912.3,64.22,77.0,0.7,143.0,1.2,43.0
3,912.3,64.4,89.0,1.2,112.0,1.6,49.5
4,912.3,64.4,185.0,0.4,260.0,1.0,58.8
5,912.3,63.5,76.0,2.5,92.0,3.0,62.6


In [10]:
## Scaling
scaler = MinMaxScaler()

weather[['air_pressure_0_1','air_temp_0_1', 'avg_wind_direction_0_1', 'avg_wind_speed', 'max_wind_direction_0_1', 'max_wind_speed_0_1', 'relative_humidity_0_1']] = scaler.fit_transform(weather)
weather.head(10)

Unnamed: 0,air_pressure,air_temp,avg_wind_direction,avg_wind_speed,max_wind_direction,max_wind_speed,relative_humidity,air_pressure_0_1,air_temp_0_1,avg_wind_direction_0_1,max_wind_direction_0_1,max_wind_speed_0_1,relative_humidity_0_1
1,912.3,63.86,161.0,0.024768,215.0,1.5,39.9,0.297959,0.474801,0.448468,0.598886,0.038997,0.424702
2,912.3,64.22,77.0,0.021672,143.0,1.2,43.0,0.297959,0.480106,0.214485,0.398329,0.030641,0.458288
3,912.3,64.4,89.0,0.037152,112.0,1.6,49.5,0.297959,0.482759,0.247911,0.311978,0.041783,0.528711
4,912.3,64.4,185.0,0.012384,260.0,1.0,58.8,0.297959,0.482759,0.51532,0.724234,0.02507,0.629469
5,912.3,63.5,76.0,0.077399,92.0,3.0,62.6,0.297959,0.469496,0.211699,0.256267,0.08078,0.670639
6,912.3,62.78,79.0,0.074303,89.0,2.7,65.6,0.297959,0.458886,0.220056,0.247911,0.072423,0.703142
7,912.3,62.42,86.0,0.06192,92.0,2.4,65.2,0.297959,0.453581,0.239554,0.256267,0.064067,0.698808
8,912.3,62.24,105.0,0.043344,125.0,1.9,65.8,0.297959,0.450928,0.292479,0.348189,0.050139,0.705309
9,912.3,62.24,93.0,0.012384,126.0,0.7,58.6,0.297959,0.450928,0.259053,0.350975,0.016713,0.627302
10,912.3,62.24,144.0,0.037152,167.0,1.8,38.5,0.297959,0.450928,0.401114,0.465181,0.047354,0.409534


In [None]:
## Defining a list to store silhoutte scores
silhouette = list()

for i in range (2, 21):
    
    ## Clustering the data into i cluster
    kmeans_md = KMeans(n_clusters = i, n_init = 20).fit(weather[['air_pressure_0_1','air_temp_0_1', 'avg_wind_direction_0_1', 'avg_wind_speed', 'max_wind_direction_0_1', 'max_wind_speed_0_1', 'relative_humidity_0_1']])
    kmeans_md_labels = kmeans_md.labels_
    
    ## computing the silhoette score
    score = silhouette_score(weather[['air_pressure_0_1','air_temp_0_1', 'avg_wind_direction_0_1', 'avg_wind_speed', 'max_wind_direction_0_1', 'max_wind_speed_0_1', 'relative_humidity_0_1']], kmeans_md_labels)
    silhouette.append(score)
    
## Visualizing the results
import matplotlib.pyplot as plt
    
plt.plot(range(2, 21), silhouette)
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid()
plt.show();

In [None]:
## Clustering the data
four_clusters = KMeans(n_clusters = 4, n_init = 20).fit(customers[['Visit_Time_0_1', 'Average_Expense_0_1', 'Sex_0_1', 'Age_0_1']])

## Appending cluster labels
customers['cluster'] = four_clusters.labels_
customers.head(10)

In [None]:
## Cluster 0
cluster_0 = weather[weather['cluster'] == 0]
cluster_0.describe()

Question 2:

In [23]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from itertools import product


## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'mckayla-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to read from s3 bucket
file_key = 'churn-bigml-80.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

telecom_train = pd.read_csv(file_content_stream)
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [24]:
## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'mckayla-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to read from s3 bucket
file_key = 'churn-bigml-20.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

telecom_test = pd.read_csv(file_content_stream)
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn
0,LA,117,408,No,No,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False
1,IN,65,415,No,No,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True
2,NY,161,415,No,No,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True
3,SC,111,415,No,No,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False
4,HI,49,510,No,No,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False


In [25]:
## Creating the variable Churn_numb
telecom_train['Churn_numb'] = np.where(telecom_train['Churn'] == False, 0, 1)
telecom_test['Churn_numb'] =  np.where(telecom_test['Churn'] == False, 0, 1)

In [27]:
## Changing International_plan to a numerical variable
telecom_train['International_plan'] = np.where(telecom_train['International_plan'] == 'No', 0, 1)
telecom_test['International_plan'] = np.where(telecom_test['International_plan'] == 'No', 0, 1)

In [28]:
## Changing Voice_mail_plan to a numerical variable
telecom_train['Voice_mail_plan'] = np.where(telecom_train['Voice_mail_plan'] == 'No', 0, 1)
telecom_test['Voice_mail_plan'] = np.where(telecom_test['Voice_mail_plan'] == 'No', 0, 1)

In [29]:
## Creating a new variable - total_charge
telecom_train['total_charge'] = telecom_train['Total_day_charge'] + telecom_train['Total_eve_charge'] + telecom_train['Total_night_charge'] + telecom_train['Total_intl_charge']
telecom_test['total_charge'] = telecom_test['Total_day_charge'] + telecom_test['Total_eve_charge'] + telecom_test['Total_night_charge'] + telecom_test['Total_intl_charge']

In [30]:
telecom_train.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,Churn_numb,total_charge
0,KS,128,415,1,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False,0,75.56
1,OH,107,415,1,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False,0,59.24
2,NJ,137,415,1,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False,0,62.29
3,OH,84,408,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False,0,66.8
4,OK,75,415,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False,0,52.09


In [31]:
telecom_test.head()

Unnamed: 0,State,Account_length,Area_code,International_plan,Voice_mail_plan,Number_vmail_messages,Total_day_minutes,Total_day_calls,Total_day_charge,Total_eve_minutes,Total_eve_calls,Total_eve_charge,Total_night_minutes,Total_night_calls,Total_night_charge,Total_intl_minutes,Total_intl_calls,Total_intl_charge,Customer_service_calls,Churn,Churn_numb,total_charge
0,LA,117,408,1,0,0,184.5,97,31.37,351.6,80,29.89,215.8,90,9.71,8.7,4,2.35,1,False,0,73.32
1,IN,65,415,1,0,0,129.1,137,21.95,228.5,83,19.42,208.8,111,9.4,12.7,6,3.43,4,True,1,54.2
2,NY,161,415,1,0,0,332.9,67,56.59,317.8,97,27.01,160.6,128,7.23,5.4,9,1.46,4,True,1,92.29
3,SC,111,415,1,0,0,110.4,103,18.77,137.3,102,11.67,189.6,105,8.53,7.7,6,2.08,2,False,0,41.05
4,HI,49,510,1,0,0,119.3,117,20.28,215.1,109,18.28,178.7,90,8.04,11.1,1,3.0,1,False,0,49.6


In [32]:
## Keeping these variables
telecom_train = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 'Churn_numb']]
telecom_train.head()

Unnamed: 0,Account_length,International_plan,Voice_mail_plan,total_charge,Customer_service_calls,Churn_numb
0,128,1,1,75.56,1,0
1,107,1,1,59.24,1,0
2,137,1,0,62.29,0,0
3,84,1,0,66.8,2,0
4,75,1,0,52.09,3,0


In [33]:
## Keeping these variables
telecom_train = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls', 'Churn_numb']]
telecom_train.head()

Unnamed: 0,Account_length,International_plan,Voice_mail_plan,total_charge,Customer_service_calls,Churn_numb
0,128,1,1,75.56,1,0
1,107,1,1,59.24,1,0
2,137,1,0,62.29,0,0
3,84,1,0,66.8,2,0
4,75,1,0,52.09,3,0


In [34]:
## Defining the input and target variables
X = telecom_train[['Account_length', 'International_plan', 'Voice_mail_plan', 'total_charge', 'Customer_service_calls']]
Y = telecom_train['Churn_numb']

## Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y)

In [35]:
for i in range (0,1000):
    
    ## Building the random forest model
    RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

    ## Extracting the importances
    RF_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': RF_md.feature_importances_})
    RF_importances = RF_importances.sort_values(by = 'Importance', ascending = False)
RF_importances

Unnamed: 0,Feature,Importance
3,total_charge,0.656408
4,Customer_service_calls,0.251862
2,Voice_mail_plan,0.064801
0,Account_length,0.02693
1,International_plan,0.0


In [39]:
for i in range (0, 1000):
    
    ## Building the AdaBoost Model
    ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

    ## Extracting the feature importances
    ada_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': ada_md.feature_importances_})
    ada_importances = ada_importances.sort_values(by = 'Importance', ascending = False)
ada_importances

KeyboardInterrupt: 

In [38]:
for i in range (0, 1000):
    
    ## Building the GradientBoost Model
    gb_md = GradientBoostingClassifier(max_depth = 3, n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)

    ## Extracting the feature importances
    gb_importances = pd.DataFrame({'Feature': X_train.columns, 'Importance': gb_md.feature_importances_})
    gb_importances = gb_importances.sort_values(by = 'Importance', ascending = False)
gb_importances

Unnamed: 0,Feature,Importance
3,total_charge,0.66622
4,Customer_service_calls,0.184615
2,Voice_mail_plan,0.137145
0,Account_length,0.012019
1,International_plan,0.0


In [None]:
## Average importances for all models
importances_avg = (RF_importances + ada_importances +gb_importances / 3)
importances_avg = pd.DataFrame({'Feature': X_train.columns, 'Importance': RF_md.feature_importances_})
importances_avg = importances_avg.sort_values(by = 'Importance', ascending = False)
importances_avg

In [None]:
## Defining the input and target variables
X = telecom_train[[]]
Y = telecom_train['Churn_numb']

## Splitting the data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify = Y)

In [None]:
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

dictionary = {'n_tree': [100, 500, 1000, 1500, 2000], 'depth': [3, 5, 7]}

RF_parameters = expand_grid(dictionary)

## Building the random forest model
RF_md = RandomForestClassifier(RF_parameters).fit(X_train, Y_train)

## Predicting on test
RF_pred = RF_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
RF_labels = np.where(RF_pred < 0.1, 0, 1)

## Computing the recall score
RF_recall = recall_score(Y_test, RF_labels)
print('The recall score for the first random forest model is', RF_recall)

## Computing the accuracy score
RF_accuracy = accuracy_score(Y_test, RF_labels)
print('The accuracy score for the first random forest model is', RF_accuracy)

In [None]:
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

dictionary = {'n_tree': [100, 500, 1000, 1500, 2000], 'depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}

ada_parameters = expand_grid(dictionary)

## Building the AdaBoost model
ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(ada_parameters).fit(X_train, Y_train)

## Predicting on test
ada_pred = ada_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
ada_labels = np.where(ada_pred < 0.1, 0, 1)

## Computing the recall score
ada_recall = recall_score(Y_test, ada_labels)
print('The recall score for the first adaboost model is', ada_recall)

## Computing the accuracy score
ada_accuracy = accuracy_score(Y_test, ada_labels)
print('The accuracy score for the first adaboost model is', ada_accuracy)   

In [None]:
def expand_grid(dictionary):
    return pd.DataFrame([row for row in product(*dictionary.values())], columns = dictionary.keys())

dictionary = {'n_tree': [100, 500, 1000, 1500, 2000], 'depth': [3, 5, 7], 'learning_rate': [0.1, 0.01, 0.001]}

gb_parameters = expand_grid(dictionary)

## Building the Gradient Boosting model
gb_md = GradientBoostingClassifier(gb_parameters).fit(X_train, Y_train)

## Predicting on test
gb_pred = gb_md.predict_proba(X_test)[:, 1]

## Changing likelihoods to labels
gb_labels = np.where(gb_pred < 0.1, 0, 1)

## Computing the recall score
gb_recall = recall_score(Y_test, gb_labels)
print('The recall score for the first gradient boosting model is', gb_recall)

## Computing the accuracy score
gb_accuracy = accuracy_score(Y_test, gb_labels)
print('The accuracy score for the first gradient boosting model is', gb_accuracy)   

In [None]:
## Average recall and accuracy of random forest models
RF_recall_avg = np.mean(RF_recall)
print('The average recall score for the random forest model is', RF_recall_avg)
RF_accuracy_avg = np.mean(RF_accuracy)
print('The average accuracy score for the random forest model is', RF_accuracy_avg)

In [None]:
## Average recall and accuracy of adaboost models
ada_recall_avg = np.mean(ada_recall)
print('The average recall score for the adaboost model is', ada_recall_avg)
ada_accuracy_avg = np.mean(ada_accuracy)
print('The average accuracy score for the adaboost model is', ada_accuracy_avg)

In [None]:
## Average recall and accuracy of gradient boosting models
gb_recall_avg = np.mean(gb_recall)
print('The average recall score for the gradient boosting model is', ada_recall_avg)
gb_accuracy_avg = np.mean(gb_accuracy)
print('The average accuracy score for the gradient boosting model is', ada_accuracy_avg)