In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score

In [2]:
# Import the data
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_1/datasets/crowdfunding-data-imbalanced.csv")
df.head()

Unnamed: 0,goal,backers_count,outcome
0,3700,24,0
1,7100,31,0
2,8300,57,0
3,79705,1254,0
4,5320,336,0


In [3]:
# Show the total number of positive and negative outcomes
df['outcome'].value_counts()

1    565
0    111
Name: outcome, dtype: int64

In [4]:
# Create an X and y variable
X = df.drop(columns=['outcome'])
y = df['outcome']

In [5]:
# Create a Logistic Regression Model
classifier = LogisticRegression()

# Fit the model to the training data
classifier.fit(X, y)

# Calculate the accuracy of the model
classifier.score(X, y)

0.8727810650887574

In [6]:
# Make predictions on the test data
predictions = classifier.predict(X)

# Create a confusion matrix
print(confusion_matrix(y, predictions, labels = [1,0]))

[[553  12]
 [ 74  37]]


In [7]:
# Create a classification report
print(classification_report(y, predictions, labels = [1, 0]))

              precision    recall  f1-score   support

           1       0.88      0.98      0.93       565
           0       0.76      0.33      0.46       111

    accuracy                           0.87       676
   macro avg       0.82      0.66      0.70       676
weighted avg       0.86      0.87      0.85       676



In [8]:
# Calculate the balanced accuracy score
print(balanced_accuracy_score(y, predictions))

0.656047197640118


In [9]:
# Predict values with probabilities
pred_probas = classifier.predict_proba(X)

# Print the probabilities
pred_probas

array([[0.1448087 , 0.8551913 ],
       [0.1616855 , 0.8383145 ],
       [0.1609259 , 0.8390741 ],
       ...,
       [0.10526033, 0.89473967],
       [0.10196936, 0.89803064],
       [0.10738461, 0.89261539]])

In [10]:
# Each prediction includes a prediction for both the 0 class and the 1 class
# We only need the predictions for the 1 class; use a list comprehension to 
# gather the second value from each list

pred_probas_firsts = [prob[1] for prob in pred_probas]

# Print the first 5 probabilities
pred_probas_firsts[0:5]

[0.8551912971691843,
 0.8383144957302657,
 0.8390741025091597,
 0.7717647199981804,
 0.9160796479655225]

In [11]:
# Calculate the roc_auc_score
print(roc_auc_score(y, pred_probas_firsts))

0.8603364426373276
