# Lab | Making predictions with logistic regression

In this lab, you will be using the [Sakila](https://dev.mysql.com/doc/sakila/en/) database of movie rentals.

In order to optimize our inventory, we would like to know which films will be rented next month and we are asked to create a model to predict it.

### Instructions

1. Create a query or queries to extract the information you think may be relevant for building the prediction model. It should include some film features and some rental features. Use the data from 2005.
2. Create a query to get the list of films and a boolean indicating if it was rented last month (August 2005). This would be our target variable.
3. Read the data into a Pandas dataframe.
4. Analyze extracted features and transform them. You may need to encode some categorical variables, or scale numerical variables.
5. Create a logistic regression model to predict this variable from the cleaned data.
6. Evaluate the results.


In [1]:
# Import libraries
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql
from sqlalchemy import create_engine
import getpass



In [2]:
pass_ = getpass.getpass('Please input your connection credentials:')
connection_string = 'mysql+pymysql://root:'+ pass_ + '@localhost/sakila'
sakila_engine = create_engine(connection_string)



In [30]:
# For this problem we will likely need data from the rental, inventory and film tables of the sakila database.
# Primarily, we need the  rental_id, rental_date, film_id and title data.
query = 'SELECT  f.film_id, f.title, c.name,  f.length, f.rental_rate, f.rating, f.rental_duration, f.release_year from sakila.film f join sakila.film_category fc on f.film_id = fc.film_id join sakila.category c on(fc.category_id = c.category_id) group by f.film_id order by f.film_id;'
label_query = ('select j1.film_id, j1.title, (month(j1.rental_date)= 08) as August_rental, COUNT(j1.rental_date) from (SELECT r.rental_id, r.rental_date, i.film_id, c.name, f.title, f.length, f.rental_rate, f.rating, f.rental_duration, f.release_year from sakila.rental r join sakila.inventory i using(inventory_id) join sakila.film f on i.film_id = f.film_id join sakila.film_category fc on f.film_id = fc.film_id join sakila.category c on(fc.category_id = c.category_id)where YEAR(convert(r.rental_date,DATE)) = 2005 and YEAR(convert(r.rental_date,DATE)) is not null group by r.rental_id) as j1 where MONTH(convert(j1.rental_date,DATE)) = 08 group by j1.film_id order by film_id;') 
rentals = pd.read_sql_query(query, sakila_engine)
August_rentals = pd.read_sql_query(label_query, sakila_engine)

August_rentals.rename(columns ={'COUNT(j1.rental_date)': 'August rental'}, inplace = True)

In [31]:
# View the data types
rentals.dtypes
# View the shape of the data set
rentals.shape

# Add a column for rented in August to filter out films NOT rented in August
rentals['August Rental'] = 0





In [32]:
# Compare rentals to target dataframe based on non-similar movie ids.
rented_Aug =  August_rentals['title'].tolist()
rented_Aug
full_film_list = rentals['title'].tolist()
full_film_list
not_rented = pd.DataFrame([x for x in full_film_list if x not in rented_Aug],columns = ['title'])
not_rented['August_rental'] = 0
full_ids =rentals['film_id'].tolist()
rented_ids =August_rentals['film_id'].tolist()
not_rented['film_id'] =pd.DataFrame([x for x in full_ids if x not in rented_ids],columns = ['title'])
not_rented




Unnamed: 0,title,August_rental,film_id
0,ALICE FANTASIA,0,14
1,APOLLO TEEN,0,33
2,ARGONAUTS TOWN,0,36
3,ARK RIDGEMONT,0,38
4,ARSENIC INDEPENDENCE,0,41
5,BOONDOCK BALLROOM,0,87
6,BUTCH PANTHER,0,108
7,CATCH AMISTAD,0,128
8,CHINATOWN GLADIATOR,0,144
9,CHOCOLATE DUCK,0,148


In [33]:
# Add the films not rented in August to the target set.
y = pd.concat([August_rentals,not_rented],axis = 0)
y.fillna(0, inplace= True)
y

Unnamed: 0,film_id,title,August_rental,August rental
0,1,ACADEMY DINOSAUR,1,9.0
1,2,ACE GOLDFINGER,1,4.0
2,3,ADAPTATION HOLES,1,6.0
3,4,AFFAIR PREJUDICE,1,6.0
4,5,AFRICAN EGG,1,3.0
...,...,...,...,...
37,909,TREASURE COMMAND,0,0.0
38,943,VILLAIN DESPERATE,0,0.0
39,950,VOLUME HOUSE,0,0.0
40,954,WAKE JAWS,0,0.0


In [41]:
# View the target data set. 
y.shape
#August_rentals.dtypes
#August_rentals.isna().sum()
y.rename(columns = {'August rental':'Rental_counts'}, inplace = True)
y.sort_values(by = 'film_id', inplace = True)
y.reset_index(drop = True,inplace = True)
y

Unnamed: 0,title,August_rental,Rental_counts
0,ACADEMY DINOSAUR,1,9.0
1,ACE GOLDFINGER,1,4.0
2,ADAPTATION HOLES,1,6.0
3,AFFAIR PREJUDICE,1,6.0
4,AFRICAN EGG,1,3.0
...,...,...,...
995,YOUNG LANGUAGE,1,3.0
996,YOUTH KICK,1,3.0
997,ZHIVAGO CORE,1,2.0
998,ZOOLANDER FICTION,1,7.0


In [52]:
# Review statistical information about the predictors
rentals.describe(include = 'all', datetime_is_numeric= True)
X = pd.concat([rentals,y], axis = 1).reset_index()
X.drop(['index'],axis = 1, inplace = True)
X


Unnamed: 0,film_id,title,name,length,rental_rate,rating,rental_duration,release_year,August Rental,title.1,August_rental,Rental_counts
0,1,ACADEMY DINOSAUR,Documentary,86,0.99,PG,6,2006,0,ACADEMY DINOSAUR,1,9.0
1,2,ACE GOLDFINGER,Horror,48,4.99,G,3,2006,0,ACE GOLDFINGER,1,4.0
2,3,ADAPTATION HOLES,Documentary,50,2.99,NC-17,7,2006,0,ADAPTATION HOLES,1,6.0
3,4,AFFAIR PREJUDICE,Horror,117,2.99,G,5,2006,0,AFFAIR PREJUDICE,1,6.0
4,5,AFRICAN EGG,Family,130,2.99,G,6,2006,0,AFRICAN EGG,1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,YOUNG LANGUAGE,Documentary,183,0.99,G,6,2006,0,YOUNG LANGUAGE,1,3.0
996,997,YOUTH KICK,Music,179,0.99,NC-17,4,2006,0,YOUTH KICK,1,3.0
997,998,ZHIVAGO CORE,Horror,105,0.99,NC-17,6,2006,0,ZHIVAGO CORE,1,2.0
998,999,ZOOLANDER FICTION,Children,101,2.99,R,5,2006,0,ZOOLANDER FICTION,1,7.0


In [53]:
# split data into training and testing sets.
from sklearn.model_selection import train_test_split

# Remove non-essential columns from the training datasets.
# 'title' has too many unique values to be useful as a categorical predictor.
# 'release_year' shows no variation across the distribution.
# 'August Rental' is directly correlated with the target.
X = X.drop(['title','release_year','August Rental','August_rental'],axis = 1)
y = y['August_rental']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
# Review split data.
display(X_train.shape)
display(X_test.shape)
display(y_train.shape)
display(y_test.shape)
X_train.head()

(700, 7)

(300, 7)

(700,)

(300,)

Unnamed: 0,film_id,name,length,rental_rate,rating,rental_duration,Rental_counts
541,542,Action,52,2.99,G,3,4.0
440,441,Documentary,125,2.99,PG-13,5,3.0
482,483,Games,171,2.99,NC-17,3,7.0
422,423,Children,69,0.99,PG,7,6.0
778,779,Sports,54,4.99,R,4,2.0


# Transform the data for modelling
1.
2. Split the training and testing data into numerical and categorical information for normalization.

In [54]:
# Transform data for model training and testing
X_train_nums = X_train._get_numeric_data()
X_test_nums = X_test._get_numeric_data()
X_train_cats = X_train.select_dtypes(include = 'object')
X_test_cats = X_test.select_dtypes(include = 'object')
X_train_cats

Unnamed: 0,name,rating
541,Action,G
440,Documentary,PG-13
482,Games,NC-17
422,Children,PG
778,Sports,R
...,...,...
106,Drama,G
270,Action,G
860,Games,R
435,New,NC-17


In [55]:
from sklearn.preprocessing import StandardScaler
# define function to output normalized and standardized training and testing data.

def num_scaler(Xtraining, Xtesting):
    # Normalize the numerical data with MinMaxScaler
    normal_transform = MinMaxScaler().fit(Xtraining)
    # Normalize the numerical date with StandardScaler
    standard_transform = StandardScaler().fit(Xtraining)
    # Transform the training data
    Xtrain_normalized = pd.DataFrame(normal_transform.transform(Xtraining), columns = Xtraining.columns)
    Xtrain_standardized = pd.DataFrame(standard_transform.transform(Xtraining), columns = Xtraining.columns)
    # Transform the testing data
    Xtest_normalized = pd.DataFrame(normal_transform.transform(Xtesting), columns = Xtraining.columns)
    Xtest_standardized = pd.DataFrame(standard_transform.transform(Xtesting), columns = Xtraining.columns)
    return Xtrain_normalized, Xtrain_standardized, Xtest_normalized, Xtest_standardized

# Transform training and testing data.
X_train_norm, X_train_stand, X_test_norm, X_test_stand = num_scaler(X_train_nums,X_test_nums)


In [56]:
# Transform categorical data
from sklearn.preprocessing import OneHotEncoder

def cat_encoder(Xtraining, Xtesting):
    # create encoder object
    OH_encoder = OneHotEncoder().fit(Xtraining)
    #create header list
    category_columns = []
    for columns in Xtraining.columns:
        for column_header in Xtraining[columns].unique():
            category_columns.append(column_header)
    # encode training data and re-assign result as dataframe with column names
    Xtrain_encoded = OH_encoder.transform(Xtraining).toarray()        
    Xtrain_encoded = pd.DataFrame(Xtrain_encoded, columns = category_columns)
    # encode testing data and re-assign result as dataframe with column names
    Xtest_encoded = OH_encoder.transform(Xtesting).toarray()        
    Xtest_encoded = pd.DataFrame(Xtest_encoded, columns = category_columns)
    
    # return the encoded datas frames
    return Xtrain_encoded, Xtest_encoded

X_train_cats_encoded, X_test_cats_encoded = cat_encoder(X_train_cats, X_test_cats)
X_train_cats_encoded.shape
X_test_cats_encoded.shape
X_train_cats_encoded.head()





Unnamed: 0,Action,Documentary,Games,Children,Sports,Classics,Comedy,New,Foreign,Animation,...,Music,Family,Drama,Travel,Horror,G,PG-13,NC-17,PG,R
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [57]:
# Concatenate numerical and categorical data for training and testing.
X_train1 = pd.concat([X_train_norm,X_train_cats_encoded], axis = 1)
X_train2 = pd.concat([X_train_stand,X_train_cats_encoded], axis = 1)
X_test1 = pd.concat([X_test_norm,X_test_cats_encoded], axis = 1)
X_test2 = pd.concat([X_test_norm,X_test_cats_encoded], axis = 1)

X_train1.head()


Unnamed: 0,film_id,length,rental_rate,rental_duration,Rental_counts,Action,Documentary,Games,Children,Sports,...,Music,Family,Drama,Travel,Horror,G,PG-13,NC-17,PG,R
0,0.541542,0.043165,0.5,0.0,0.307692,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.44044,0.568345,0.5,0.5,0.230769,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.482482,0.899281,0.5,0.0,0.538462,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.422422,0.165468,0.0,1.0,0.461538,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.778779,0.057554,1.0,0.25,0.153846,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [58]:
# Import model object for training.
from sklearn.linear_model import LogisticRegression

# Instantiate and train model
LR_Model1 = LogisticRegression(random_state=34, solver='lbfgs',multi_class='multinomial').fit(X_train2, y_train)
Train1_predictions = LR_Model1.predict(X_train2) 
classification_score = LR_Model1.score(X_train2, y_train)
classification_score


1.0

In [59]:
# Test model performance
predictions1 = LR_Model1.predict(X_test2)
test_score1 = LR_Model1.score(X_test2, y_test)
test_score1

0.9666666666666667

In [60]:
# Review model performance using confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions1)



array([[  0,  10],
       [  0, 290]], dtype=int64)

In [61]:
pd.Series(predictions1).value_counts()

1    300
dtype: int64

In [62]:
# Try: KNN classifier: look at nearest neighbours and use the majority to determine class
from sklearn import neighbors
knn_model = neighbors.KNeighborsClassifier(n_neighbors=2, weights='uniform')
knn_model.fit(X_train1, y_train)
predictions_knn = knn_model.predict(X_test1)
knn_model.score(X_test1, y_test)

0.9