# Final Project
## Will Salvi
### 13 December 2023

***

#### Import packages

In [149]:
import pandas as pd
import numpy as np
import altair as alt
import streamlit as st
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, confusion_matrix

***

#### Q1.  Read in the data, call the dataframe "s"  and check the dimensions of the dataframe

In [150]:
# Load the survey responses into `s`; any cell holding the literal string
# "UNKNOWN" is read as NaN.
s = pd.read_csv("social_media_usage.csv",
                na_values="UNKNOWN")
# Last expression of the cell: shows the frame's (rows, columns).
s.shape

(1502, 89)

***

#### Q2.  Define a function called clean_sm that takes one input, x, and uses `np.where` to check whether x is equal to 1. If it is, make the value of x = 1, otherwise make it 0. Return x. Create a toy dataframe with three rows and two columns and test your function to make sure it works as expected

In [151]:
def clean_sm(x):
    """Return 1 where ``x`` equals 1 and 0 everywhere else.

    ``x`` may be a scalar or an array-like; the result comes straight from
    ``np.where``, so it is a NumPy array (zero-dimensional for a scalar).
    """
    is_one = x == 1
    return np.where(is_one, 1, 0)

# Toy frame for testing clean_sm: a label column and a numeric column in
# which only the first value is 1.
data = dict(col1=["a", "b", "c"], col2=[1, 2, 5])

toy_df = pd.DataFrame.from_dict(data)
toy_df

Unnamed: 0,col1,col2
0,a,1
1,b,2
2,c,5


In [152]:
# clean_sm is built on np.where, which already works on a whole column, so
# call it once on the Series instead of once per row through .apply().
toy_df["col3"] = clean_sm(toy_df["col2"])
toy_df

Unnamed: 0,col1,col2,col3
0,a,1,1
1,b,2,0
2,c,5,0


***

#### Q3.  Create a new dataframe called "ss". The new dataframe should contain a target column called sm_li, which should be a binary variable (it takes the value of 1 if it is 1 and 0 otherwise; use clean_sm to create this) that indicates whether or not the individual uses LinkedIn, and the following features: income (ordered numeric from 1 to 9, above 9 considered missing), education (ordered numeric from 1 to 8, above 8 considered missing), parent (binary), married (binary), female (binary), and age (numeric, above 98 considered missing). Drop any missing values. Perform exploratory analysis to examine how the features are related to the target.

In [153]:
def gender_bender(x):
    """Return 1 where ``x`` equals 2 and 0 everywhere else.

    Used to recode the survey ``gender`` column into the binary ``female``
    feature. ``x`` may be a scalar or an array-like; the result comes
    straight from ``np.where``.
    """
    is_two = x == 2
    return np.where(is_two, 1, 0)

# Largest raw code that counts as a valid answer for each column; anything
# above it (or NaN, which fails every comparison) is treated as missing.
# NOTE(review): the task statement says age "above 98" is missing, but this
# cutoff also drops age == 98 -- confirm which is intended.
# NOTE(review): rows with web1h > 2 are dropped rather than recoded to 0 --
# confirm against the task statement ("1 if it is 1 and 0 otherwise").
valid_max = {
    "income": 9,
    "educ2": 8,
    "par": 2,
    "marital": 6,
    "gender": 2,
    "age": 97,
    "web1h": 2,
}

# Every filter is evaluated on the raw survey codes, so all of them can be
# applied in one pass before any recoding. Taking an explicit copy makes ss
# an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
raw = s[list(valid_max)]
keep = raw.le(pd.Series(valid_max), axis="columns").all(axis=1)
ss = raw[keep].copy()

# Target: 1 where web1h == 1, otherwise 0.
ss["web1h"] = clean_sm(ss["web1h"])

# make gender binary (1 where gender == 2); renamed to female below
ss["gender"] = gender_bender(ss["gender"])

# make parent binary (1 where par == 1)
ss["par"] = clean_sm(ss["par"])

# make marital binary: codes 1 and 4 become 1, codes 2, 3, 5 and 6 become 0
# NOTE(review): code 4 is mapped to 1 together with code 1 -- confirm against
# the survey codebook that code 4 should count as married.
ss["marital"] = ss["marital"].replace({4: 1, 2: 0, 3: 0, 5: 0, 6: 0})

ss = ss.rename(columns={"web1h": "sm_li", "gender": "female"})

# Exploratory check: pairwise correlations between the features and the target.
ss.corr()

Unnamed: 0,income,educ2,par,marital,female,age,sm_li
income,1.0,0.437344,0.088159,0.362896,-0.097925,0.017725,0.323963
educ2,0.437344,1.0,0.038354,0.186022,0.055788,0.11411,0.305474
par,0.088159,0.038354,1.0,0.242816,0.031536,-0.268004,0.101478
marital,0.362896,0.186022,0.242816,1.0,-0.041619,0.164432,0.077338
female,-0.097925,0.055788,0.031536,-0.041619,1.0,0.094751,-0.073074
age,0.017725,0.11411,-0.268004,0.164432,0.094751,1.0,-0.162071
sm_li,0.323963,0.305474,0.101478,0.077338,-0.073074,-0.162071,1.0


***

#### Q4.  Create a target vector (y) and feature set (X)

In [154]:
# Feature set: every column of ss except the target.
x_ss = ss.drop(columns="sm_li")
x_ss.shape

(1240, 6)

In [155]:
# Target vector: the binary sm_li column on its own.
y_ss = ss.loc[:, "sm_li"]
y_ss.shape

(1240,)

***

#### Q5.  Split the data into training and test sets. Hold out 20% of the data for testing. Explain what each new object contains and how it is used in machine learning

In [156]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# share of sm_li == 1 the same in both parts, and the fixed seed makes the
# split reproducible.
splits = train_test_split(
    x_ss,
    y_ss,
    test_size=0.2,
    stratify=y_ss,
    random_state=987,
)
X_train, X_test, y_train, y_test = splits

#### This creates training and test sets so we can estimate how each feature relates to our target variable, sm_li, which indicates whether someone uses LinkedIn.  We train the model on 80% of the data (X_train & y_train) so that it can learn the weight of each feature.  We then use the remaining 20% of the data, our test sets (X_test & y_test), to check how well the fitted model performs on data it has not seen.  The y datasets (y_train & y_test) contain only the sm_li column, whereas the X datasets (X_train & X_test) contain all other columns from our dataset.

***

#### Q6.  Instantiate a logistic regression model and set class_weight to balanced. Fit the model with the training data.

In [157]:
# Instantiate the classifier; class_weight="balanced" is what the question
# asks for.
lr = LogisticRegression(class_weight = "balanced") # instantiate the algorithm

In [158]:
# Only the training split is used for fitting; the test split stays unseen
# until evaluation.
lr.fit(X_train, y_train) # fit the algorithm to the training data

***

#### Q7.  Evaluate the model using the testing data. What is the model accuracy for the model? Use the model to make predictions and then generate a confusion matrix from the model. Interpret the confusion matrix and explain what each number means.

In [159]:
# Predicted class labels for the held-out rows.
y_pred = lr.predict(X_test) # make predictions using the model and the testing data

In [160]:
# Accuracy: fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy is {accuracy:.2f}")

Model accuracy is 0.71


In [161]:
# Rows are the actual classes (0, 1) and columns the predicted classes (0, 1).
confusion_matrix(y_test, y_pred)

array([[116,  49],
       [ 22,  61]], dtype=int64)

There are 116 true negatives, instances correctly predicted as negative.  There are 49 false positives, negatives incorrectly predicted as positive.  There are 22 false negatives, positives incorrectly predicted as negative.  Lastly, there are 61 true positives, positives correctly predicted as positive.

***

#### Q8.  Create the confusion matrix as a dataframe and add informative column names and index names that indicate what each quadrant represents

In [162]:
# Label the confusion matrix so each quadrant is self-explanatory:
# rows are the actual classes, columns the predicted classes.
cm_counts = confusion_matrix(y_test, y_pred)
cm_labeled = pd.DataFrame(
    cm_counts,
    index=["Actual negative", "Actual positive"],
    columns=["Predicted negative", "Predicted positive"],
)

# Colour the cells by magnitude for display.
cm_labeled.style.background_gradient(cmap="PiYG")

Unnamed: 0,Predicted negative,Predicted positive
Actual negative,116,49
Actual positive,22,61


***

#### Q9.  Aside from accuracy, there are three other metrics used to evaluate model performance: precision, recall, and F1 score. Use the results in the confusion matrix to calculate each of these metrics by hand. Discuss each metric and give an actual example of when it might be the preferred metric of evaluation. After calculating the metrics by hand, create a classification_report using sklearn and check to ensure your metrics match those of the classification_report.

#### Precision measures how many of our positive predictions are actually correct and is useful when false positives are costly.  Cancer diagnosis is a good example.

#### Recall measures the model's ability to identify actual positives (the true positive rate) and is best used when false negatives are too costly.  Fraud detection is one example.

#### The F1 score is the harmonic mean of precision and recall.  It is useful when the class distribution is imbalanced.

In [167]:
# Counts for the positive class, read off the confusion matrix above.
true_pos, false_pos, false_neg = 61, 49, 22

# Precision: share of predicted positives that are truly positive.
precision = true_pos / (true_pos + false_pos)
# Recall: share of actual positives that the model found.
recall = true_pos / (true_pos + false_neg)
# F1: harmonic mean of precision and recall.
f1 = 2 * (precision * recall) / (precision + recall)

print(f"Our model has a prescision of {precision:.2f} with a recall of {recall:.2f} and a F1 score of {f1:.2f}")

Our model has a prescision of 0.55 with a recall of 0.73 and a F1 score of 0.63


In [168]:
# Cross-check the hand-computed metrics: the row for class 1 should match them.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.70      0.77       165
           1       0.55      0.73      0.63        83

    accuracy                           0.71       248
   macro avg       0.70      0.72      0.70       248
weighted avg       0.74      0.71      0.72       248



***

#### Q10.  Use the model to make predictions. For instance, what is the probability that a high income (e.g. income=8), with a high level of education (e.g. 7), non-parent who is married female and 42 years old uses LinkedIn? How does the probability change if another person is 82 years old, but otherwise the same?

In [169]:
# Values follow the training column order:
# income, educ2, par, marital, female, age
person1 = [8, 7, 0, 1, 1, 42]

# The model was fitted on a DataFrame, so pass a DataFrame labelled with the
# same columns; a bare list makes scikit-learn warn that X has no valid
# feature names and hides any column-order mistake.
person1_df = pd.DataFrame([person1], columns=X_train.columns)

predicted_class = lr.predict(person1_df)
probs = lr.predict_proba(person1_df)

print(f"Predicted class: {predicted_class[0]}") # 0 = not a LinkedIn user, 1 = LinkedIn user
# predict_proba returns one column per class; column 1 is the probability of class 1
print(f"Probability that this person is Linkdin user: {probs[0][1]}")


Predicted class: 1
Probability that this person is Linkdin user: 0.727926074993896




In [170]:
# Same profile as the previous example but aged 82. Values follow the
# training column order: income, educ2, par, marital, female, age
person2 = [8, 7, 0, 1, 1, 82]

# The model was fitted on a DataFrame, so pass a DataFrame labelled with the
# same columns; a bare list makes scikit-learn warn that X has no valid
# feature names and hides any column-order mistake.
person2_df = pd.DataFrame([person2], columns=X_train.columns)

predicted_class = lr.predict(person2_df)
probs = lr.predict_proba(person2_df)

print(f"Predicted class: {predicted_class[0]}") # 0 = not a LinkedIn user, 1 = LinkedIn user
# predict_proba returns one column per class; column 1 is the probability of class 1
print(f"Probability that this person is Linkdin user: {probs[0][1]}")

Predicted class: 0
Probability that this person is Linkdin user: 0.47994534192441385




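####  Age has an inverse effect on LinkedIn user status: holding everything else constant, raising the age from 42 to 82 lowers the predicted probability from about 0.73 to about 0.48, so an older person is less likely to be a LinkedIn user.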