 # Predicting Good Movies

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
# Load the cleaned movie dataset. The path is relative to the notebook's
# working directory.
start_data = pd.read_csv('Resources/cleaned.csv')

# List the available columns so later cells can select features by name.
# (Only the last expression of a cell is displayed, so the previous
# mid-cell `head()` call produced no output and has been dropped.)
start_data.columns

Index(['Title', 'Rated', 'Released', 'Runtime', 'Genre', 'Director', 'Actors',
       'Language', 'Country', 'Awards', 'Metascore', 'imdbRating', 'imdbVotes',
       'imdbID', 'BoxOffice', 'Rotten_Tomatoes', 'R_Year', 'R_Month', 'R_Day',
       'Rounded Rating', 'Good_Movie', 'DRAMA', 'HORROR', 'MUSIC', 'THRILLER',
       'DOCUMENTARY', 'ANIMATION', 'ACTION', 'ADVENTURE', 'BIOGRAPHY',
       'COMEDY', 'FANTASY', 'MYSTERY', 'ROMANCE', 'SCI-FI', 'CRIME', 'FAMILY',
       'HISTORY', 'SPORT', 'WAR', 'MUSICAL', 'WESTERN', 'NEWS', 'REALITY-TV',
       'Unnamed: 44', 'GAME-SHOW', 'TALK-SHOW', 'BoxOffice_Inf_Adj'],
      dtype='object')

In [17]:
# Show the IMDb rating next to the target flag for rows flagged as good movies.
is_good_movie = start_data['Good_Movie'] == True
start_data.loc[is_good_movie, ['imdbRating', 'Good_Movie']]

Unnamed: 0,imdbRating,Good_Movie
2,7.4,True
28,7.7,True
44,8.3,True
49,7.5,True
56,7.5,True
...,...,...
17939,7.5,True
17949,7.6,True
17955,8.1,True
17967,7.7,True


In [3]:
# Columns kept for modelling: credit/release metadata plus the target flag,
# followed by the per-genre flag columns (same order as before).
metadata_columns = ['Director', 'Rated', 'R_Year', 'R_Month', 'R_Day', 'Runtime', 'Good_Movie']
genre_columns = [
    'DRAMA', 'HORROR', 'MUSIC', 'THRILLER',
    'DOCUMENTARY', 'ANIMATION', 'ACTION', 'ADVENTURE', 'BIOGRAPHY',
    'COMEDY', 'FANTASY', 'MYSTERY', 'ROMANCE', 'SCI-FI', 'CRIME', 'FAMILY',
    'HISTORY', 'SPORT', 'WAR', 'MUSICAL', 'WESTERN', 'NEWS',
]

# Keep only rows released in 2010 or later, restrict to the modelling
# columns, and replace every missing value with 0.
released_since_2010 = start_data['R_Year'] >= 2010
model_data = (
    start_data.loc[released_since_2010, metadata_columns + genre_columns]
    .copy()
    .fillna(0)
)
model_data

Unnamed: 0,Director,Rated,R_Year,R_Month,R_Day,Runtime,Good_Movie,DRAMA,HORROR,MUSIC,...,ROMANCE,SCI-FI,CRIME,FAMILY,HISTORY,SPORT,WAR,MUSICAL,WESTERN,NEWS
0,Orson Welles,R,2018.0,11.0,2.0,122,False,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,André Szöts,Not Rated,2021.0,1.0,8.0,74,False,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,John Bailey,PG-13,2019.0,11.0,13.0,101,True,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,John Mulholland,Unrated,2013.0,9.0,27.0,120,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Stuart Paul,TV-MA,2018.0,11.0,6.0,104,False,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17973,Joe Begos,Not Rated,2020.0,2.0,14.0,92,False,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17974,Geeta Malik,PG-13,2022.0,2.0,3.0,101,False,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17975,Robert Fernandez,PG,2019.0,4.0,18.0,108,False,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17976,Michael Dowse,TV-MA,2020.0,4.0,3.0,88,False,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# One-hot encode the categorical columns into a NEW frame (`df`) instead of
# rebinding `model_data`. The previous version overwrote `model_data` with its
# own encoded output, so running the cell a second time raised a KeyError
# because the listed columns no longer existed. `model_data` now stays in its
# un-encoded form and this cell is safe to re-run.
categorical_columns = ['Rated', 'R_Year', 'R_Month', 'R_Day', 'Director']
df = pd.get_dummies(model_data, columns=categorical_columns)

 ## Separate the Features (X) from the Target (y)

In [5]:
# The target is the boolean Good_Movie flag; every other column is a feature.
target_column = "Good_Movie"
X = df.drop(columns=[target_column])
y = df[target_column]

 ## Split our data into training and testing

In [6]:


# Split into train and test sets. No test_size is given, so scikit-learn's
# default split is used. Stratifying on the target keeps the class balance the
# same in both splits, and the fixed seed makes the split reproducible.
# (`train_test_split` is imported in the notebook's top import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)
X_train.shape

(13482, 12451)

 ## Create a Logistic Regression Model

In [7]:
# Build and fit the logistic-regression classifier on the training split.
# max_iter=1000 is above scikit-learn's default iteration cap (presumably to
# let lbfgs converge on this wide feature matrix); random_state is fixed for
# reproducibility.
# (`LogisticRegression` is imported in the notebook's top import cell.)
classifier = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=1)
classifier.fit(X_train, y_train)

 ## Fit (train) our model using the training data

 ## Score the model using the test data

In [8]:
# Mean accuracy on the data the model was fit on, then on the held-out split.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.908989764129951
Testing Data Score: 0.8696039163328882


 ## Make predictions

In [9]:
# Predict on the held-out features and pair each prediction with its true
# label. The index is reset so rows are numbered 0..n-1 rather than carrying
# the original row labels from y_test.
predictions = classifier.predict(X_test)
paired = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results = paired.reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
5,False,False
6,False,False
7,False,False
8,False,False
9,False,False


## Calculate the Accuracy Score

In [19]:
# Non-null counts per column, then only the rows whose true label is True.
print(results.count())
actual_positive = results['Actual'] == True
print(results.loc[actual_positive])

Prediction    4494
Actual        4494
dtype: int64
      Prediction  Actual
10         False    True
16         False    True
23         False    True
28         False    True
36         False    True
...          ...     ...
4425        True    True
4432       False    True
4465        True    True
4474       False    True
4480       False    True

[600 rows x 2 columns]


In [10]:
# Display the accuracy score for the test dataset.
# (`accuracy_score` is imported in the notebook's top import cell.)
accuracy_score(y_test, predictions)

0.8696039163328882

In [11]:
# Rows are the actual classes and columns the predicted classes, in sorted
# label order (False, True).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[3822,   72],
       [ 514,   86]])

In [12]:
# Display names for the two classes, in sorted label order (False, True).
# The previous names ("Training", "Test") described data splits, not classes,
# which made the per-class rows of the report misleading.
target_names = ["Not Good Movie", "Good Movie"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

    Training       0.88      0.98      0.93      3894
        Test       0.54      0.14      0.23       600

    accuracy                           0.87      4494
   macro avg       0.71      0.56      0.58      4494
weighted avg       0.84      0.87      0.84      4494

