In [1]:
# Importing all dedicated libraries
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import pickle
import pymysql
from sqlalchemy import create_engine
import getpass  
from sklearn import neighbors

In [2]:
pd.set_option('display.max_rows', 50)

In [3]:
password = getpass.getpass()

········


In [52]:
#get the data
connection_string = 'mysql+pymysql://root:'+password+'@localhost/sakila'
engine = create_engine(connection_string)
query = '''SELECT f.title, r.rental_date, c.name, f.rental_rate, r.customer_id
FROM category c
JOIN film_category fc
USING (category_id)
JOIN film f
USING (film_id)
JOIN inventory i
USING (film_id)
JOIN rental r
USING (inventory_id);'''

df = pd.read_sql_query(query, engine)
df.head()

Unnamed: 0,title,rental_date,name,rental_rate,customer_id
0,AMADEUS HOLY,2005-08-02 01:16:59,Action,0.99,77
1,AMADEUS HOLY,2005-08-18 04:26:54,Action,0.99,39
2,AMADEUS HOLY,2005-06-20 20:35:28,Action,0.99,34
3,AMADEUS HOLY,2005-07-09 05:01:58,Action,0.99,254
4,AMADEUS HOLY,2005-07-30 08:02:39,Action,0.99,276


In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [53]:
len(df['title'].unique())

958

In [54]:
# shape of the df
df.shape

(16044, 5)

In [55]:
# ordering the data
df= df[['title', 'name', 'rental_rate', 'customer_id', 'rental_date']]

In [56]:
# renaming columns
df.columns= ['title', 'category_name', 'rental_rate', 'customer_id', 'rental_date']

In [57]:
# types of the columns
df.dtypes

title                    object
category_name            object
rental_rate             float64
customer_id               int64
rental_date      datetime64[ns]
dtype: object

In [58]:
# new column for movies that were rented in august of that year
df['rented_aug']= df['rental_date'].dt.month==8

In [59]:
df['rented_aug'] = df['rented_aug'].astype(int)

In [60]:
df.head()

Unnamed: 0,title,category_name,rental_rate,customer_id,rental_date,rented_aug
0,AMADEUS HOLY,Action,0.99,77,2005-08-02 01:16:59,1
1,AMADEUS HOLY,Action,0.99,39,2005-08-18 04:26:54,1
2,AMADEUS HOLY,Action,0.99,34,2005-06-20 20:35:28,0
3,AMADEUS HOLY,Action,0.99,254,2005-07-09 05:01:58,0
4,AMADEUS HOLY,Action,0.99,276,2005-07-30 08:02:39,0


In [61]:
# dropping the cust_id and rental_date col to try without them first 
df.drop(columns= ['customer_id', 'rental_date'], inplace= True)

In [62]:
df.head()

Unnamed: 0,title,category_name,rental_rate,rented_aug
0,AMADEUS HOLY,Action,0.99,1
1,AMADEUS HOLY,Action,0.99,1
2,AMADEUS HOLY,Action,0.99,0
3,AMADEUS HOLY,Action,0.99,0
4,AMADEUS HOLY,Action,0.99,0


In [15]:
# creating a new column for how many times was the movie rented throughout the year
df['times_rented'] = df['title'].map(df['title'].value_counts())

In [16]:
# getting info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16044 entries, 0 to 16043
Data columns (total 5 columns):
title            16044 non-null object
category_name    16044 non-null object
rental_rate      16044 non-null float64
rented_aug       16044 non-null int64
times_rented     16044 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 626.8+ KB


In [17]:
# get uniques
def get_unique(column):
    return df[column].unique()


get_unique('category_name')
get_unique('rental_rate')
get_unique('rented_aug')
get_unique('times_rented')

array(['Action', 'Animation', 'Children', 'Classics', 'Comedy',
       'Documentary', 'Drama', 'Family', 'Foreign', 'Games', 'Horror',
       'Music', 'New', 'Sci-Fi', 'Sports', 'Travel'], dtype=object)

array([0.99, 4.99, 2.99])

array([1, 0])

array([21, 22, 10, 18, 19, 16,  9, 24, 25, 12, 14, 11, 15, 23, 26,  8, 28,
       20, 13, 17, 27, 30,  6,  7, 32, 29,  5, 31,  4, 33, 34])

In [18]:
df.columns

Index(['title', 'category_name', 'rental_rate', 'rented_aug', 'times_rented'], dtype='object')

In [19]:
# re-ordering data
df= df[['title', 'category_name', 'rental_rate', 'times_rented', 'rented_aug']]

In [20]:
df.shape
df['rented_aug'].value_counts()

(16044, 5)

0    10358
1     5686
Name: rented_aug, dtype: int64

In [21]:
# ordering data by repeated titles then true false
df= df.sort_values(by= ['title', 'rented_aug'], ascending= [True, False])

In [22]:
# keeping only the unique titles and all true movies rented in august except for those that were never rented in august
df_unique= df.drop_duplicates(subset= 'title')

In [23]:
df_unique.shape

(958, 5)

In [24]:
df_unique['rented_aug'].value_counts()

1    958
Name: rented_aug, dtype: int64

In [25]:
df_unique.head()
df_unique_tit = df_unique['title']

Unnamed: 0,title,category_name,rental_rate,times_rented,rented_aug
5104,ACADEMY DINOSAUR,Documentary,0.99,23,1
10311,ACE GOLDFINGER,Horror,4.99,7,1
5127,ADAPTATION HOLES,Documentary,2.99,12,1
10320,AFFAIR PREJUDICE,Horror,2.99,23,1
7216,AFRICAN EGG,Family,2.99,12,1


In [26]:
df_unique.drop(columns= 'title', inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [27]:
df_unique['rented_aug'] = df_unique['rented_aug'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
df_unique.dtypes

category_name     object
rental_rate      float64
times_rented       int64
rented_aug        object
dtype: object

In [29]:
df_unique["rented_aug"] = df_unique["rented_aug"].str.replace('1', 'T')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
df_unique.head()

Unnamed: 0,category_name,rental_rate,times_rented,rented_aug
5104,Documentary,0.99,23,T
10311,Horror,4.99,7,T
5127,Documentary,2.99,12,T
10320,Horror,2.99,23,T
7216,Family,2.99,12,T


In [31]:
df_unique.tail(1).index

Int64Index([5074], dtype='int64')

In [32]:
df_unique.at[5074,'rented_aug']='F'

In [33]:
df_unique.tail(-1)

Unnamed: 0,category_name,rental_rate,times_rented,rented_aug
10311,Horror,4.99,7,T
5127,Documentary,2.99,12,T
10320,Horror,2.99,23,T
7216,Family,2.99,12,T
8313,Foreign,2.99,21,T
...,...,...,...,...
6147,Documentary,0.99,7,T
11982,Music,0.99,6,T
11151,Horror,0.99,9,T
3208,Children,2.99,17,T


In [34]:
df_unique.dtypes

category_name     object
rental_rate      float64
times_rented       int64
rented_aug        object
dtype: object

 ### every movie was rentend in august .... why wouldnt that happen again???? THIS MAKES NOT SENSE AT ALL ! By just replying YES YES YES YES ... im assuring to have at least a 99% accuracy wether that specific movie will be rented in august next year if we based our analysis on the data we have ! 

In [35]:
df_unique.dtypes
df_unique.head(2)

category_name     object
rental_rate      float64
times_rented       int64
rented_aug        object
dtype: object

Unnamed: 0,category_name,rental_rate,times_rented,rented_aug
5104,Documentary,0.99,23,T
10311,Horror,4.99,7,T


In [36]:
X = df_unique.select_dtypes(include = np.number)

# Scaling data
transformer = MinMaxScaler().fit(X)
x_normalized = transformer.transform(X)
x = pd.DataFrame(x_normalized)

In [37]:
cat = df_unique.select_dtypes(include = np.object)
cat = cat.drop(['rented_aug'], axis=1) # target variable
categorical = pd.get_dummies(cat)
categorical.head()

Unnamed: 0,category_name_Action,category_name_Animation,category_name_Children,category_name_Classics,category_name_Comedy,category_name_Documentary,category_name_Drama,category_name_Family,category_name_Foreign,category_name_Games,category_name_Horror,category_name_Music,category_name_New,category_name_Sci-Fi,category_name_Sports,category_name_Travel
5104,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
10311,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5127,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
10320,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7216,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [38]:
y = df_unique['rented_aug']
X = np.concatenate((x, categorical), axis=1)

In [39]:
y.dtype

dtype('O')

In [40]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
y_train

1419     T
15533    T
10911    T
275      T
5199     T
        ..
13103    T
373      T
15141    T
8746     T
2347     T
Name: rented_aug, Length: 670, dtype: object

In [42]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='multinomial').fit(X_train, y_train)

In [43]:
classification = LogisticRegression(random_state=0, solver='saga',
                  multi_class='multinomial').fit(X_train, y_train)

In [44]:
predictions = classification.predict(X_test)
classification.score(X_test, y_test)
predictions

1.0

array(['T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T',
       'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T', 'T

In [45]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[288]])

In [46]:
clf = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform')
clf.fit(X, y)
predictions_clf = clf.predict(X_test)
clf.score(X_test, y_test)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

1.0

In [47]:
# no sense at all ...