In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import warnings

In [2]:
# Supress deprecation & future warnings since we're instructed to use older sklearn library
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

## Import and Explore data:

In [3]:
# Import data from csv file
try:
    df_users_behavior = pd.read_csv('./users_behavior.csv')
except:
    print("File(s) not found, please check file path(s) are correct")
df_users_behavior.head(5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


In [4]:
df_users_behavior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [5]:
df_users_behavior.duplicated().sum()

0

In [6]:
# Cast calls and messages values to int
df_users_behavior['calls'] = df_users_behavior['calls'].astype('int64')
df_users_behavior['messages'] = df_users_behavior['messages'].astype('int64')

In [7]:
# Verify that dtype changes took affect
df_users_behavior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   int64  
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   int64  
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(2), int64(3)
memory usage: 125.7 KB


In [8]:
df_users_behavior.head(5)

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40,311.9,83,19915.42,0
1,85,516.75,56,22696.96,0
2,77,467.66,86,21060.45,0
3,106,745.53,81,8437.39,1
4,66,418.74,1,14502.75,0


## Split data into training, validation,  test sets:

In [9]:
# Define feature and target
features = df_users_behavior.drop(['is_ultra'], axis=1)
target = df_users_behavior['is_ultra']

# Use train_test_split() to split dataset into 60% training, 40% for validation & testing
features_train, features_valid_test, target_train, target_valid_test = train_test_split(features, target, test_size=0.4, random_state=12345)

In [10]:
features_train

Unnamed: 0,calls,minutes,messages,mb_used
3027,60,431.56,26,14751.26
434,33,265.17,59,17398.02
1226,52,341.83,68,15462.38
1054,42,226.18,21,13243.48
1842,30,198.42,0,8189.53
...,...,...,...,...
2817,12,86.62,22,36628.85
546,65,458.46,0,15214.25
382,144,906.18,0,25002.44
2177,38,301.27,37,28914.24


In [11]:
target_train

3027    0
434     0
1226    0
1054    0
1842    0
       ..
2817    1
546     1
382     1
2177    1
482     1
Name: is_ultra, Length: 1928, dtype: int64

In [12]:
# Split validation & testing in half, so each is 20% of original dataset

features_valid, features_test, target_valid, target_test = train_test_split(features_valid_test, target_valid_test, test_size=0.5, random_state=12345)

In [13]:
features_valid

Unnamed: 0,calls,minutes,messages,mb_used
1386,92,536.96,18,20193.90
3124,40,286.57,17,17918.75
1956,81,531.22,56,17755.06
2286,67,460.76,27,16626.26
3077,22,120.09,16,9039.57
...,...,...,...,...
1999,56,398.45,4,23682.94
1023,76,601.10,0,17104.36
748,81,525.97,15,18878.91
1667,10,63.03,0,2568.00


In [14]:
features_test

Unnamed: 0,calls,minutes,messages,mb_used
160,61,495.11,8,10891.23
2498,80,555.04,28,28083.58
1748,87,697.23,0,8335.70
1816,41,275.80,9,10032.39
1077,60,428.49,20,29389.52
...,...,...,...,...
2401,55,446.06,79,26526.28
2928,102,742.65,58,16089.24
1985,52,349.94,42,12150.72
357,39,221.18,59,17865.23


## Classification Models:

### Decision Tree Classifier Model:

In [15]:
# Decision Tree Classifier without hyperparameter tuning
model = DecisionTreeClassifier(random_state=12345)

model.fit(features_train, target_train)

test_predictions = model.predict(features_test)

print('Accuracy')
print('Test set:', model.score(features_test,target_test))
print('Validation set:', model.score(features_valid,target_valid))

Accuracy
Test set: 0.7309486780715396
Validation set: 0.713841368584759


In [16]:
# Tune hyperparameter max_depth
best_model = None
best_result = 0
best_depth = 0
for depth in range(1, 10):
    model = DecisionTreeClassifier(random_state=12345, max_depth=depth) # create a model with the given depth
    model.fit(features_train, target_train) # train the model
    result = model.score(features_test,target_test) # calculate the accuracy
    if result > best_result:
        best_model = model
        best_result = result
        best_depth = depth

print("Best model has a max depth of:", best_depth)
print("Accuracy of the best model on test set:", best_result)
print("Accuracy of the best model on validation set:", best_model.score(features_valid,target_valid))

Best model has a max depth of: 7
Accuracy of the best model on test set: 0.7993779160186625
Accuracy of the best model on validation set: 0.7822706065318819


### Random Forest Classifier Model:

In [17]:
# Random Forest Classifier without hyperparameter tuning
model = RandomForestClassifier(random_state=12345)
model.fit(features_train,target_train)

print("Accuracy on test set:", model.score(features_test,target_test))
print("Accuracy on validation set:", model.score(features_valid,target_valid))

Accuracy on test set: 0.7807153965785381
Accuracy on validation set: 0.7853810264385692


In [18]:
# Tune hyperparameter n_estimators
best_model = None
best_score = 0
best_est = 0
for est in range(1, 50): # choose hyperparameter range
    model = RandomForestClassifier(random_state=12345, n_estimators=est) # set number of trees
    model.fit(features_train,target_train) # train model on training set
    score = model.score(features_test,target_test) # calculate accuracy score on validation set
    if score > best_score:
        best_score = score
        best_est = est
        best_model = model
    # Print loading pattern
    print("Tuning in progress" + "."*(est%4), end="\r")    

print("Accuracy of the best model on the test set (n_estimators = {}): {}".format(best_est, best_score))
print("Accuracy of the best model on validation set:", best_model.score(features_valid,target_valid))

Accuracy of the best model on the test set (n_estimators = 48): 0.7962674961119751
Accuracy of the best model on validation set: 0.7916018662519441


### Logistic Regression Model:

In [19]:
# Logistic Regression Classifier 
model =  LogisticRegression(random_state=12345, solver='liblinear') # initialize logistic regression constructor with parameters random_state=12345 and solver='liblinear'
model.fit(features_train, target_train)   # train model on training set
score_test = model.score(features_test,target_test) # calculate accuracy score on training set
score_valid = model.score(features_valid,target_valid) # calculate accuracy score on validation set

print("Accuracy on test set:", score_test,)
print("Accuracy on validation set:", score_valid,)

Accuracy on test set: 0.7402799377916018
Accuracy on validation set: 0.7573872472783826


## Sanity Checks:

### Random Classifier:

In [20]:
# Generate random predictions
predictions_random = pd.Series(np.random.randint(0, 2, features_test.shape[0]))

# Calculate the accuracy of the random classifier
random_accuracy = accuracy_score(target_test, predictions_random)
print(f"Random classifier accuracy: {random_accuracy}")

Random classifier accuracy: 0.5038880248833593


### Majority Class Baseline:

In [21]:
# Find the majority class in the training set
majority_class = target_train.mode()[0]

# Predict the majority class for all test observations
predictions_majority = [majority_class] * len(target_test)

# Calculate the accuracy of the majority class baseline
majority_class_accuracy = accuracy_score(target_test, predictions_majority)
print(f"Majority class accuracy: {majority_class_accuracy}")

Majority class accuracy: 0.6842923794712286


## Conclusions:

- Best model is the Random Forest with an accuracy of 0.796 and 0.791 on the test and validation sets respectively with an n_estimator of 48. 
- Second best model is the Decision Tree with an accuracy of 0.799 and 0.78 on the test and validation sets respectively with a max depth of 7.
- Logistic regression performed the worst with an accuracy of 0.74 and 0.75 on the test and validation sets respectively. 
- Decision Tree had the largest variation in accuracy between test and validation sets.
- All three models performed better than the random and majority class classifiers, which is a good sanity check that models are working.