# Data preprocessing


In [None]:
import pandas as pd

# Read the cleaned book data and discard the "Unnamed: 0" column
# (presumably an index saved along with the CSV -- confirm against the export step).
df = pd.read_csv("flip_book_cleaned.csv").drop(columns="Unnamed: 0")

# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2994 entries, 0 to 2993
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   title             2994 non-null   object 
 1   rating            2994 non-null   float64
 2   review_count      2994 non-null   float64
 3   price             2994 non-null   int64  
 4   publishing_house  2994 non-null   object 
 5   binding           2994 non-null   object 
 6   number_of_pages   2994 non-null   float64
 7   paper_type        2994 non-null   object 
 8   language          2994 non-null   object 
 9   release_date      2994 non-null   float64
 10  height            2994 non-null   float64
 11  width             2994 non-null   float64
 12  illustrated       2994 non-null   object 
 13  thickness         2994 non-null   float64
dtypes: float64(7), int64(1), object(6)
memory usage: 327.6+ KB


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns with dtype `object` must be turned into numbers before modelling.
object_columns = df.select_dtypes(include=['object']).columns

# Fit a separate encoder for every column and keep it. Re-fitting one shared
# LabelEncoder per column would retain only the mapping of the last column,
# making it impossible to translate codes back to category names
# (label_encoders[col].inverse_transform) or to encode new rows consistently
# (label_encoders[col].transform).
label_encoders = {column: LabelEncoder().fit(df[column]) for column in object_columns}

# Replace each object column in `df` with its integer codes.
# NOTE: `df` is modified in place; re-run the loading cell before re-running
# this one, otherwise no object columns are left and `label_encoders` ends up empty.
for column, encoder in label_encoders.items():
    df[column] = encoder.transform(df[column])


In [None]:
# Show the frame after label encoding; pandas truncates the display of large
# frames to the first and last rows.
df

Unnamed: 0,title,rating,review_count,price,publishing_house,binding,number_of_pages,paper_type,language,release_date,height,width,illustrated,thickness
0,2162,5.0,2.0,2841,135,4,304.0,5,5,2022.000000,215.000000,140.00000,1,22.596082
1,2005,5.0,19.0,2639,167,4,80.0,5,5,2010.000000,263.000000,202.00000,1,12.000000
2,2473,5.0,102.0,4135,167,4,912.0,5,5,2017.000000,219.000000,145.00000,1,40.000000
3,51,4.7,64.0,1990,130,1,304.0,5,5,2023.000000,180.000000,115.00000,1,13.000000
4,2002,4.8,91.0,2080,167,4,272.0,5,5,2016.000000,208.496604,144.30631,1,22.596082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2989,256,4.8,9.0,2564,37,4,208.0,6,5,2022.000000,224.000000,148.00000,1,16.000000
2990,2377,5.0,7.0,3217,167,4,384.0,5,5,1973.184035,205.000000,130.00000,1,25.000000
2991,25,5.0,13.0,940,167,4,416.0,1,5,2022.000000,209.000000,135.00000,1,22.000000
2992,1137,5.0,2.0,5033,40,4,896.0,5,5,2021.000000,210.000000,140.00000,1,40.000000


# Regression models


Linear regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

# Target is the price; every remaining column of `df` is used as a feature.
y = df['price']
X = df.drop(columns="price")

# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares fitted on the training rows.
model_linear = LinearRegression().fit(X_train, y_train)
y_pred_linear = model_linear.predict(X_test)

# Coefficient of determination measured on the held-out rows.
r2_linear = r2_score(y_test, y_pred_linear)
print(f"R^2 score for Linear Regression: {r2_linear}")


R^2 score for Linear Regression: 0.5403032836949089


Random forest regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the current X_train / y_train split; the seed is
# fixed so repeated runs build the same forest.
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the forest on the held-out rows with R^2.
y_pred_rf = model_rf.predict(X_test)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"R^2 score for Random Forest Regression: {r2_rf}")

R^2 score for Random Forest Regression: 0.49603191715812944


Price prediction example

In [None]:
# Feature values for one hypothetical book. Columns that were label-encoded
# during preprocessing (title, publishing_house, binding, paper_type,
# language, illustrated) are given here as integer codes.
# Keys are listed in the same order as the columns the models were trained on.
single_row_data = {
    'title': 200,
    'rating': 4.3,
    'review_count': 120,
    'publishing_house': 2,
    'binding': 2,
    'number_of_pages': 250,
    'paper_type': 1,
    'language': 5,
    'release_date': 2022,
    'height': 200.0,
    'width': 150.0,
    'illustrated': 0,
    'thickness': 12.5
}

single_row_df = pd.DataFrame([single_row_data])

# Store the single-row results under their own names: `y_pred_rf` and
# `y_pred_linear` hold the test-set predictions from the evaluation cells and
# must not be overwritten by this example.
price_rf = model_rf.predict(single_row_df)[0]
price_linear = model_linear.predict(single_row_df)[0]

print(f"Random forest regression: {price_rf}")
print(f"Linear regression: {price_linear}")
print(f"Average: {(price_rf + price_linear)/ 2}")

Random forest regression: 4648.3
Linear regression: 2030.7557223018662
Average: 3339.527861150933


# Classification models

In [None]:
# `df` was label-encoded in place in the preprocessing section, so its
# publishing_house column no longer contains publisher names. Re-read the raw
# file here so this section does not depend on the execution order of earlier
# cells and the target keeps its readable string labels.
books_raw = pd.read_csv("flip_book_cleaned.csv").drop(columns="Unnamed: 0")

# Target: the publisher name. Features: every other column.
X = books_raw.drop("publishing_house", axis=1)
y = books_raw["publishing_house"]

# Encode the remaining string-valued feature columns as integers, keeping one
# fitted encoder per column so the mappings stay available afterwards.
object_columns = X.select_dtypes(include=['object']).columns
feature_encoders = {column: LabelEncoder().fit(X[column]) for column in object_columns}

for column, encoder in feature_encoders.items():
    X[column] = encoder.transform(X[column])



0              Росмэн
1               Эксмо
2               Эксмо
3       РИПОЛ классик
4               Эксмо
            ...      
2989              АСТ
2990            Эксмо
2991            Эксмо
2992           Азбука
2993              АСТ
Name: publishing_house, Length: 2994, dtype: object

Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# 80/20 split of the classification features and publisher labels.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the split
# created in the regression section.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Seeded random forest classifier trained on the training rows.
model_rf_cls = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Fraction of held-out rows whose publisher is predicted correctly.
y_pred_rf_cls = model_rf_cls.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf_cls)
print(f"Accuracy (Random Forest): {accuracy_rf}")

Accuracy (Random Forest): 0.5742904841402338


KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

k_neighbors = 10

# KNN classifies by distance between rows, so features with large numeric
# ranges (e.g. price) would dominate small-range ones (e.g. rating) if left
# unscaled. Standardising inside a pipeline fixes that and keeps
# `model_knn.predict` usable on raw, unscaled rows: the scaler fitted on the
# training data is applied automatically at prediction time.
model_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=k_neighbors),
)
model_knn.fit(X_train, y_train)

y_pred_knn = model_knn.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f"Accuracy (KNN): {accuracy_knn}")

Accuracy (KNN): 0.337228714524207


Single row prediction

In [None]:
# One hypothetical book described by the classification features. The price
# is filled with the mean price of the feature frame; the other values are
# hand-picked, with categorical columns given as integer codes.
single_row_data = dict(
    title=200,
    rating=4.3,
    review_count=120,
    price=X["price"].mean(),
    binding=2,
    number_of_pages=250,
    paper_type=1,
    language=5,
    release_date=2022,
    height=200.0,
    width=150.0,
    illustrated=0,
    thickness=12.5,
)

single_row_df = pd.DataFrame([single_row_data])

# Ask each classifier for the publisher of this single row and print the
# returned array as-is.
rf_cls_prediction = model_rf_cls.predict(single_row_df)
print(f"Random forest classifier: {rf_cls_prediction}")

knn_prediction = model_knn.predict(single_row_df)
print(f"KNN: {knn_prediction}")

Random forest classifier: ['Попурри']
KNN: ['АСТ']
