In [126]:
import os
from functools import partial
from typing import List

import pandas as pd
import psycopg2 as ps

# Database connection settings. Each one can be overridden with a SAKILA_DB_*
# environment variable so that real credentials do not have to be stored in
# the notebook; the fallbacks are the original local-development values.
DB_HOST = os.environ.get('SAKILA_DB_HOST', 'localhost')
DB_PORT = int(os.environ.get('SAKILA_DB_PORT', '5433'))
DB_USER = os.environ.get('SAKILA_DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('SAKILA_DB_PASSWORD', 'postgres')  # dev-only fallback
DB_DATABASE = os.environ.get('SAKILA_DB_DATABASE', 'Sakila_Sample_Database')

# Directory prefix (note the trailing slash) that file names are appended to
# when CSV files are written and read back.
DATA_PATH = './dataframes/'

# Key/value connection string assembled from the settings above.
CONNECTION_STR = f"host={DB_HOST} port={DB_PORT} dbname={DB_DATABASE} user={DB_USER} password={DB_PASSWORD}"


# Denormalised film extract. film_actor, film_category and inventory are all
# joined on film_id, so a film is repeated once for every combination of its
# actor, category and inventory rows. Every join is an inner join, so a film
# with no matching row in one of the joined tables does not appear at all.
# The order of the select list has to match the `columns` list that labels
# the fetched rows before they are saved to CSV.
QUERY = '''
select 
	film.film_id,
	film.title,
	film.description,
	film.release_year,
	film.rental_duration,
	film.rental_rate,
	film.length as duration,
	film.replacement_cost,
	film.rating,
	film.special_features,
	language.name as language,
	category.name as category,
	actor.first_name,
	actor.last_name,
	inventory.store_id,
    inventory.inventory_id 
from film
	join film_actor on film.film_id = film_actor.film_id 
	join language on film.language_id = language.language_id 
	join film_category on film.film_id = film_category.film_id
	join category on film_category.category_id = category.category_id 
	join inventory on film.film_id = inventory.film_id 
	join actor on actor.actor_id = film_actor.actor_id'''

def make_query(query: str, conn_str: str = ''):
    '''Run ``query`` against the database and return all fetched rows.

    Args:
        query: SQL text to execute. It is handed to the cursor unchanged, so
            it must not be assembled from untrusted input.
        conn_str: connection string passed to ``ps.connect``.

    Returns:
        A list holding every row returned by ``cursor.fetchall()``.

    Raises:
        ValueError: if ``conn_str`` is empty.
    '''
    if not conn_str:
        raise ValueError('no db connection str')

    # Connect with the string the caller supplied, not the module-level default.
    conn = ps.connect(conn_str)
    try:
        # In psycopg2 ``with conn`` only ends the transaction (commit on
        # success, rollback on error); it does not close the connection,
        # which is why the explicit close() in ``finally`` is needed.
        with conn:
            with conn.cursor() as cur:
                cur.execute(query)
                return list(cur.fetchall())
    finally:
        conn.close()

# Convenience wrapper: make_query with the connection string pre-bound, so callers pass only the SQL text.
db_fetch = partial(make_query, conn_str = CONNECTION_STR)

def save_to_csv(data: List[tuple], columns, filename: str):
    '''Write ``data`` to the CSV file ``DATA_PATH + filename``.

    Args:
        data: rows to store.
        columns: column labels applied to the rows.
        filename: file name appended to ``DATA_PATH``.

    The DataFrame index is not written to the file.
    '''
    frame = pd.DataFrame(data, columns=columns)
    target = DATA_PATH + filename
    frame.to_csv(target, index=False)


# Pull every row of the film extract from the database.
result = db_fetch(QUERY)

# Values that arrive as memoryview objects are decoded to UTF-8 text;
# every other value is kept as it is.
ready_result = [
    [
        bytes(value).decode("utf-8") if isinstance(value, memoryview) else value
        for value in row
    ]
    for row in result
]


# Column labels for the fetched rows, listed in the same order as the
# select list of QUERY.
columns = [
    'film_id',
    'title',
    'description',
    'release_year',
    'rental_duration',
    'rental_rate',
    'duration',
    'replacement_cost',
    'rating',
    'special_features',
    'language',
    'category',
    'first_name',
    'last_name',
    'store_id',
    'inventory_id',
]

# Persist the labelled rows as a CSV file.
save_to_csv(ready_result, columns, 'all_columns.csv')



# Load the extract back from the CSV file; the cells below work on this frame.
df = pd.read_csv(DATA_PATH + 'all_columns.csv')

# Preview the first rows.
df.head()

Unnamed: 0,film_id,title,description,release_year,rental_duration,rental_rate,duration,replacement_cost,rating,special_features,language,category,first_name,last_name,store_id,inventory_id
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English,Documentary,PENELOPE,GUINESS,2,8
1,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English,Documentary,PENELOPE,GUINESS,2,7
2,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English,Documentary,PENELOPE,GUINESS,2,6
3,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English,Documentary,PENELOPE,GUINESS,2,5
4,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English,Documentary,PENELOPE,GUINESS,1,4


In [127]:
# Columns converted to the pandas 'category' dtype. Despite the name, the
# tuple holds text columns as well as integer identifiers.
int_to_cat_cols = (
    'film_id',
    'store_id',
    'inventory_id',
    'category',
    'rating',
    'release_year',
    'special_features',
    'language',
    'first_name',
    'last_name',
    'title',
)

for col in int_to_cat_cols:
    df[col] = df[col].astype('category')

# Numeric columns only; the later statistics cells read from this frame.
df_num = df.select_dtypes(include='number')

# Count, mean, std, minimum, maximum and the 1st, 2nd and 3rd quartiles
# (25%, 50%, 75%) of the numeric columns.
des_num = df.describe()

print(des_num)

       rental_duration   rental_rate      duration  replacement_cost
count     25143.000000  25143.000000  25143.000000      25143.000000
mean          4.909120      2.925489    115.152925         20.188624
std           1.384962      1.632384     40.422242          6.084221
min           3.000000      0.990000     46.000000          9.990000
25%           4.000000      0.990000     80.000000         14.990000
50%           5.000000      2.990000    114.000000         20.990000
75%           6.000000      4.990000    150.000000         24.990000
max           7.000000      4.990000    185.000000         29.990000


In [128]:
# Number of missing values in each column.
df.isna().sum()

film_id             0
title               0
description         0
release_year        0
rental_duration     0
rental_rate         0
duration            0
replacement_cost    0
rating              0
special_features    0
language            0
category            0
first_name          0
last_name           0
store_id            0
inventory_id        0
dtype: int64

In [129]:
# Variance of each numeric column (pandas default, i.e. sample variance).
var_ = df_num.var()

print(var_)

# Store the variances as an additional 'var' row of the numeric summary.
des_num.loc['var'] = var_

rental_duration        1.918119
rental_rate            2.664679
duration            1633.957641
replacement_cost      37.017742
dtype: float64


In [130]:
# 0.1 and 0.9 quantiles of each numeric column.
quant_0_1 = df_num.quantile(0.1)
quant_0_9 = df_num.quantile(0.9)

print(quant_0_1, quant_0_9, sep= '\n\n')

# Store both quantiles as additional rows of the numeric summary.
des_num.loc['10%'] = quant_0_1
des_num.loc['90%'] = quant_0_9

rental_duration      3.00
rental_rate          0.99
duration            60.00
replacement_cost    11.99
Name: 0.1, dtype: float64

rental_duration       7.00
rental_rate           4.99
duration            173.00
replacement_cost     28.99
Name: 0.9, dtype: float64


In [131]:
# Median of each numeric column.
med = df_num.median()

print(med)

# Store the medians as an additional 'median' row of the numeric summary.
des_num.loc['median'] = med


rental_duration       5.00
rental_rate           2.99
duration            114.00
replacement_cost     20.99
dtype: float64


In [132]:
# Categorical columns only, plus an initially empty summary table that has
# one column per categorical column.
df_cat = df.select_dtypes(include='category')
df_cat_des = pd.DataFrame(columns=df_cat.columns)

# Number of missing values per categorical column (a count, not a share).
isna_sum = df_cat.isna().sum()

print(isna_sum)

df_cat_des.loc['isnasum'] = isna_sum

film_id             0
title               0
release_year        0
rating              0
special_features    0
language            0
category            0
first_name          0
last_name           0
store_id            0
inventory_id        0
dtype: int64


In [133]:
# Number of distinct values in each categorical column, also stored as the
# 'nunique' row of the categorical summary table.
nunique = df_cat.nunique()

df_cat_des.loc['nunique'] = nunique

print('Количество уникальных значений в колонках\n', nunique, sep='\n')

Количество уникальных значений в колонках

film_id              955
title                955
release_year           1
rating                 5
special_features      15
language               1
category              16
first_name           128
last_name            121
store_id               2
inventory_id        4568
dtype: int64


In [134]:
# Most frequent value of each categorical column. mode() may return several
# rows when values tie; only row 0 is used here.
modes = df_cat.mode()

# Store row 0 as the 'mode' row of the categorical summary table.
df_cat_des.loc['mode'] = modes.loc[0]

print('Самое частое значение в колонках')
print(modes.loc[0])

Самое частое значение в колонках
film_id                                                 880
title                                TELEMARK HEARTBREAKERS
release_year                                           2006
rating                                                PG-13
special_features    Trailers,Commentaries,Behind the Scenes
language                               English             
category                                             Sports
first_name                                         PENELOPE
last_name                                            KILMER
store_id                                                  2
inventory_id                                           2337
Name: 0, dtype: object


In [135]:
# Write both summary tables to CSV files under DATA_PATH (row labels are kept as the index column).
des_num.to_csv(DATA_PATH + 'numeric_stat.csv')
df_cat_des.to_csv(DATA_PATH + 'categorical_stat.csv')