**Project description**

**Objective**: develop a model with the highest possible accuracy that would analyze subscribers' behavior and recommend one of Megaline's newer plans: Smart or Ultra.

- the data is already pre-processed
- accuracy threshold: the model must reach an accuracy of at least 0.75

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model  import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the subscriber behavior dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='users_behavior.csv')

In [3]:
# Show the first five rows (head() defaults to 5), then column dtypes and non-null counts.
display(df.head())
df.info()

Unnamed: 0,calls,minutes,messages,mb_used,is_ultra
0,40.0,311.9,83.0,19915.42,0
1,85.0,516.75,56.0,22696.96,0
2,77.0,467.66,86.0,21060.45,0
3,106.0,745.53,81.0,8437.39,1
4,66.0,418.74,1.0,14502.75,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3214 entries, 0 to 3213
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   calls     3214 non-null   float64
 1   minutes   3214 non-null   float64
 2   messages  3214 non-null   float64
 3   mb_used   3214 non-null   float64
 4   is_ultra  3214 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 125.7 KB


In [4]:
# Separate the label column (is_ultra) from the remaining predictor columns.
features = df.drop(columns=['is_ultra'])
target = df['is_ultra']

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Keep 60% of the rows for training; the other 40% is split further in the next step.
# train_test_split returns a (train, test) pair per input array, in input order.
features_train, features_rest, target_train, target_rest = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=1,
)

In [7]:
# Split the remainder (features_rest / target_rest) evenly into validation and test sets.
features_valid, features_test, target_valid, target_test = train_test_split(
    features_rest,
    target_rest,
    test_size=0.5,
    random_state=2,
)

In [8]:
# Fraction of the full dataset that landed in the train, validation and test splits.
n_rows = len(df)
print(len(target_train) / n_rows)
print(len(target_valid) / n_rows)
len(target_test) / n_rows

0.5998755444928439
0.2000622277535781


0.2000622277535781

**LogisticRegression**

In [19]:
# Baseline: logistic regression with default hyperparameters, scored on the validation split.
model = LogisticRegression(random_state=123).fit(features_train, target_train)  # fit() returns the estimator itself
predictions_valid = model.predict(features_valid)
print(f'Accuracy score logistic regression: {round(accuracy_score(target_valid, predictions_valid), 2)}')



 Accuracy score logistic regression: 0.74


**RandomForestClassifier**

In [21]:
# Grid-search n_estimators (1..14) and max_depth (1..6) on the validation split,
# remembering the combination that gives the highest validation accuracy.
best_n_est = 0
best_depth = 0
accuracy = 0
for est in range(1, 15):
    for depth in range(1, 7):
        model = RandomForestClassifier(n_estimators=est, max_depth=depth, random_state=12)
        model.fit(features_train, target_train)
        predictions_valid = model.predict(features_valid)
        accuracy_n = accuracy_score(target_valid, predictions_valid)
        # Strict '>' means the first configuration reaching a given score is kept on ties.
        if accuracy_n > accuracy:
            accuracy = accuracy_n
            best_n_est = est
            # Record the depth together with the estimator count; reporting the loop
            # variable after the loop would always show the last value tried (6).
            best_depth = depth

print(f'Accuracy score random forest classifier: {round(accuracy, 2)} with {best_n_est} estimators and {best_depth} depth')

Accuracy score random forest classifier: 0.8 with 7 estimators and 6 depth


**DecisionTreeClassifier**

In [23]:
# Sweep tree depth 1..6 and keep whichever depth scores best on the validation split.
accuracy, best_depth = 0, 0
for depth in range(1, 7):
    model = DecisionTreeClassifier(max_depth=depth, random_state=1234).fit(features_train, target_train)
    predictions_valid = model.predict(features_valid)
    accuracy_d = accuracy_score(target_valid, predictions_valid)

    # Only a strictly better score replaces the current best, so ties keep the shallower tree.
    if accuracy_d <= accuracy:
        continue
    accuracy, best_depth = accuracy_d, depth

print(f'Accuracy score decission tree classifier: {round(accuracy, 2)} with {best_depth} depth')

Accuracy score decission tree classifier: 0.78 with 3 depth


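We choose RandomForestClassifier, which achieved the highest validation accuracy (0.80, versus 0.78 for the decision tree and 0.74 for logistic regression), and train it with n_estimators = 7 and max_depth = 6.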

In [17]:
# Refit the selected random forest on the training split and score it on the test split.
model = RandomForestClassifier(n_estimators=7, max_depth=6, random_state=12)
model.fit(features_train, target_train)
predictions_test = model.predict(features_test)
# accuracy_score is documented as (y_true, y_pred); pass the true labels first,
# matching the validation cells. The accuracy value itself is the same either way.
accuracy_score(target_test, predictions_test)

0.8040435458786936

In [25]:
# Class balance of the target: share of rows for each is_ultra value.
df['is_ultra'].value_counts(normalize=True)

0    0.693528
1    0.306472
Name: is_ultra, dtype: float64

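**Conclusion** The random forest reaches an accuracy of about 0.80 on the test set, which is above the required 0.75 threshold. However, the model is not very accurate: a constant model that always predicts the majority class (is_ultra = 0) would already score about 0.69, so the gain over that baseline is only about 11 percentage points.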