# Machine Learning in Python

## Arul Nigam

### March 24, 2024

In [3]:
# Step 1

import pandas as pd

# Read in data
s = pd.read_csv('social_media_usage.csv')

# Check the dimensions of the dataframe
print(s.shape)


(1502, 89)


In [4]:
# Step 2

import numpy as np

def clean_sm(x):
    return np.where(x == 1, 1, 0)

# toy dataframe
toy_df = pd.DataFrame({ 
    'A': [1, 2, 3],
    'B': [1, 1, 1]
})

# reference: w3schools.com/python/pandas/ref_df_applymap.asp
toy_df = toy_df.applymap(clean_sm) 
print(toy_df)


   A  B
0  1  1
1  0  1
2  0  1


In [5]:
# Step 3

# redefine clean_sm
def clean_sm(x): 
    return 1 if x == 1 else 0

# clean and recode 
s['sm_li'] = s['web1h'].apply(clean_sm)
s['income'] = s['income'].replace([98, 99], np.nan)
s['education'] = s['educ2'].replace([98, 99], np.nan)
s['parent'] = s['par'].apply(lambda x: 1 if x == 1 else 0)
s['married'] = s['marital'].apply(lambda x: 1 if x == 1 else 0)
s['female'] = s['gender'].apply(lambda x: 1 if x == 2 else 0)
s['age'] = s['age'].where(s['age'] <= 98, np.nan) # traditional .replace doesn't work for open range, so I referenced: note.nkmk.me/en/python-numpy-where/

ss = s[['sm_li', 'income', 'education', 'parent', 'married', 'female', 'age']].dropna() # create ss

# perform analysis
print(ss.describe())

             sm_li       income    education       parent      married  \
count  1440.000000  1440.000000  1440.000000  1440.000000  1440.000000   
mean      0.322917     6.382639     5.144444     0.236806     0.486111   
std       0.467754     2.688374     1.871470     0.425270     0.499981   
min       0.000000     1.000000     1.000000     0.000000     0.000000   
25%       0.000000     4.000000     3.000000     0.000000     0.000000   
50%       0.000000     7.000000     5.000000     0.000000     0.000000   
75%       1.000000     9.000000     6.000000     0.000000     1.000000   
max       1.000000    10.000000     8.000000     1.000000     1.000000   

            female          age  
count  1440.000000  1440.000000  
mean      0.418750    51.892361  
std       0.493526    18.725483  
min       0.000000    18.000000  
25%       0.000000    36.000000  
50%       0.000000    54.000000  
75%       1.000000    66.000000  
max       1.000000    98.000000  


In [6]:
# Step 4

y = ss['sm_li']
X = ss.drop('sm_li', axis=1)

In [7]:
# Step 5

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=316203)

# X_train: 80% of the dataset, including the features used for training the model.

# X_test: 20% of the dataset, including the features used for testing the model.

# y_train: X_train's target variable - the outcomes that we will actual use

# y_test: X_test's target variable - we will use these coutcomes to evaluate the model

In [8]:
# Step 6

from sklearn.linear_model import LogisticRegression

# instantiate logistic model
model = LogisticRegression(class_weight='balanced')

# fit model
model.fit(X_train, y_train)

In [9]:
# Step 7

from sklearn.metrics import accuracy_score, confusion_matrix

# apply the model
y_pred = model.predict(X_test)

# output accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# generate confusion matrix
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

# explanation of my results:

# accuracy
# proportion of total predictions that were correct
# formula: (TP+TN)/(TP+TN+FP+FN)

# confusion matrix 
# 128 = true negatives (correctly predicted that 128 people do not use LinkedIn)
# 72 = false positives / Type I error (predicted that 72 people use LinkedIn, but they actually do not)
# 22 = false negatives / Type II error (predicted that 22 people do not use LinkedIn, butn they actually do)
# 66 = true positives (correctly predicted that 66 people use LinkedIn)


Accuracy: 0.6840277777777778
[[130  73]
 [ 18  67]]


In [10]:
# Step 8

conf_mat_df = pd.DataFrame(conf_mat, 
                              index=['Actual Negative', 'Actual Positive'], 
                              columns=['Predicted Negative', 'Predicted Positive'])
print(conf_mat_df)

                 Predicted Negative  Predicted Positive
Actual Negative                 130                  73
Actual Positive                  18                  67


In [11]:
# Step 9

# precision = TP / (TP + FP) = 66 / (66 + 72) = 0.478 
# precision measures how accurate positive predictions are, so high precision indicates a low rate of false positives
# this is a valuable metric for detecting spam emails, where there is a high cost to incorrectly classifying a message as spam

# recall = TP / (TP + FN) = 66 / (66 + 22) = 0.75
# recall measures how well positives are identified. Therefore, high recall indicates a low rate of false negatives
# this metric should be employed where there is a high cost to failing to identify positive cases, such as virus tests

# f1 = 2 * (precision * recall) / (precision + recall) = 2 * (0.478 * 0.75) / (0.478 + 0.75) = 0.584
# the F1 score is the weighted average of precision and recall, accounting for both false positives and false negatives
# social media trust & safety teams may employ the f1 score to proeprly identify harmful content but also avoid accidentally flagging posts that are not harmful

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.64      0.74       203
           1       0.48      0.79      0.60        85

    accuracy                           0.68       288
   macro avg       0.68      0.71      0.67       288
weighted avg       0.76      0.68      0.70       288



In [22]:
# Step 10

feature_names = ['income', 'education', 'parent', 'married', 'female', 'age']

# profile: high income (e.g. income=8), with a high level of education (e.g. 7), non-parent who is married female and 42 years old uses LinkedIn
example1_features = pd.DataFrame([[8, 7, 0, 1, 2, 42]], columns=feature_names) 
probability_1 = model.predict_proba(example1_features)[0][1]
print(f"Probability of LinkedIn use: {probability_1:.2f}")

# profile: otherwise identical but 82 year old uses LinkedIn
example2_features = pd.DataFrame([[8, 7, 0, 1, 2, 82]], columns=feature_names) 
probability_2 = model.predict_proba(example2_features)[0][1]
print(f"Probability of LinkedIn use: {probability_2:.2f}")

# profile: a top-level income, high school educated, parent who is married, male, and 35 years old
example3_features = pd.DataFrame([[9, 3, 1, 1, 1, 35]], columns=feature_names) 
probability_3 = model.predict_proba(example3_features)[0][1]
print(f"Probability of LinkedIn use: {probability_3:.2f}")

# profile: otherwise identical but postgrad educated person
example4_features = pd.DataFrame([[9, 8, 1, 1, 1, 35]], columns=feature_names) 
probability_4 = model.predict_proba(example4_features)[0][1]
print(f"Probability of LinkedIn use: {probability_4:.2f}")

# profile: $100k income, no formal education, non-parent, never married, female, 49
example5_features = pd.DataFrame([[8, 1, 0, 6, 2, 25]], columns=feature_names) 
probability_5 = model.predict_proba(example5_features)[0][1]
print(f"Probability of LinkedIn use: {probability_5:.2f}")

Probability of LinkedIn use: 0.65
Probability of LinkedIn use: 0.37
Probability of LinkedIn use: 0.48
Probability of LinkedIn use: 0.87
Probability of LinkedIn use: 0.24
