In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
# Blanket "ignore" filter: every warning raised after this point is suppressed,
# including any emitted by pandas / scikit-learn in the cells below.
warnings.filterwarnings("ignore")

**Open and look through the data file. Path to the file: /datasets/users_behavior.csv**

In [2]:
# Load the user-behaviour table from CSV into a DataFrame.
DATASET_PATH = '/datasets/users_behavior.csv'
df = pd.read_csv(DATASET_PATH)

In [4]:
# Preview the first five rows (5 is the default for head()).
df.head(n=5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


**Split the source data into a training set, a validation set, and a test set.**

Now we need to split the data into a training set, a validation set, and a test set in a 3:1:1 ratio. First I will split the data 60:40, and then split the 40% part 50:50 to get the validation and test sets.

In [5]:
# First cut: hold out 40% of the rows (to be divided into validation and test
# in the next step); the remaining 60% is the training set.
df_train, df_valid_test = train_test_split(
    df,
    test_size=0.4,
    random_state=12345,
)

In [6]:
# Second cut: divide the held-out part in half into validation and test sets.
df_valid, df_test = train_test_split(
    df_valid_test,
    test_size=0.5,
    random_state=12345,
)

In [7]:
# Separate the target column from the feature columns for each of the three
# subsets (training, validation, test).
TARGET_COLUMN = 'is_ultra'

features_train, target_train = df_train.drop(columns=[TARGET_COLUMN]), df_train[TARGET_COLUMN]
features_valid, target_valid = df_valid.drop(columns=[TARGET_COLUMN]), df_valid[TARGET_COLUMN]
features_test, target_test = df_test.drop(columns=[TARGET_COLUMN]), df_test[TARGET_COLUMN]

**Investigate the quality of different models by changing hyperparameters. Briefly describe the findings of the study.**

First, let's try the DecisionTreeClassifier model.

In [8]:
# Sweep max_depth over 1..5: fit a tree on the training set for each depth and
# print its accuracy on the validation set.
for depth in range(1, 6):
    model = DecisionTreeClassifier(max_depth=depth, random_state=12345)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    valid_accuracy = accuracy_score(target_valid, predictions_valid)

    print("max_depth =", depth, ": ", end='')
    print(valid_accuracy)

max_depth = 1 : 0.7542768273716952
max_depth = 2 : 0.7822706065318819
max_depth = 3 : 0.7853810264385692
max_depth = 4 : 0.7791601866251944
max_depth = 5 : 0.7791601866251944


I can conclude that the decision tree reaches its highest validation accuracy (0.7854) with max_depth set to 3. Now let's investigate the RandomForestClassifier.

In [9]:
# Sweep n_estimators over 10, 20, ..., 100 with max_depth held at 10: fit a
# forest on the training set for each size and print its validation accuracy.
for n_trees in range(10, 101, 10):
    model = RandomForestClassifier(n_estimators=n_trees, max_depth=10, random_state=12345)
    model.fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    valid_accuracy = accuracy_score(target_valid, predictions_valid)

    print("n_estimators =", n_trees, ": ", end='')
    print(valid_accuracy)

n_estimators = 10 : 0.7916018662519441
n_estimators = 20 : 0.7916018662519441
n_estimators = 30 : 0.7947122861586314
n_estimators = 40 : 0.7962674961119751
n_estimators = 50 : 0.7931570762052877
n_estimators = 60 : 0.7978227060653188
n_estimators = 70 : 0.7947122861586314
n_estimators = 80 : 0.7962674961119751
n_estimators = 90 : 0.7947122861586314
n_estimators = 100 : 0.7947122861586314


I can conclude that the random forest reaches its highest validation accuracy (0.7978) with n_estimators set to 60 (at max_depth = 10). Now let's investigate LogisticRegression.

In [10]:
# Fit a logistic regression with default hyperparameters on the training set
# and print its accuracy on the validation set.
# NOTE(review): warnings are globally silenced in the import cell, so a solver
# convergence warning (if any) would not be shown here — confirm the fit converges.
model = LogisticRegression(random_state=12345).fit(features_train, target_train)
predictions_valid = model.predict(features_valid)
valid_accuracy = accuracy_score(target_valid, predictions_valid)
print(valid_accuracy)

0.7589424572317263


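Out of all the models, the RandomForestClassifier (max_depth = 10) achieved the highest accuracy on the validation set. Note that the sweep above peaks at n_estimators = 60 (0.7978); n_estimators = 70 scored 0.7947, which is still higher than the best decision tree (0.7854) and the logistic regression (0.7589).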

**Check the quality of the model using the test set.**

In [11]:
# Refit the selected random-forest configuration on the training set and score
# it once on the held-out test set.
# NOTE(review): the validation sweep output above peaks at n_estimators = 60,
# while 70 is used here — confirm which value is intended.
model = RandomForestClassifier(
    n_estimators=70,
    max_depth=10,
    random_state=12345,
).fit(features_train, target_train)
predictions_test = model.predict(features_test)
test_accuracy = accuracy_score(target_test, predictions_test)
print(test_accuracy)

0.8040435458786936


My model got 80% accuracy on the test set

**Additional task: sanity check the model. This data is more complex than what you’re used to working with, so it's not an easy task. We'll take a closer look at it later.**

In [37]:
# Share of rows with is_ultra == 1 over the whole dataset; used as the
# reference point for a constant-prediction baseline.
ultra_share = df['is_ultra'].mean()
ultra_share

0.30647168637212197

Here I can see that about 31% of the rows have is_ultra = 1. A constant model that always predicts 1 would therefore be about 31% accurate, and one that always predicts 0 would be about 69% accurate. My model's 80% test accuracy beats this best constant baseline by roughly 11 percentage points, so it passes the sanity check.