In [6]:
import sys
!{sys.executable} -m pip install -U scikit-learn pandas numpy pillow pyparsing

Defaulting to user installation because normal site-packages is not writeable


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer

### Data preparation

In [8]:
df = pd.DataFrame()
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_excel(os.path.join(root, filename), "Sheet1")
        temp_df.drop(columns=[temp_df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True, errors="ignore")
        df = pd.concat([df, temp_df], axis=0, sort=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24484 entries, 0 to 1497
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Время          16445 non-null  object 
 1   Цена до        16445 non-null  float64
 2   Цена после     16445 non-null  float64
 3   Текст новости  16445 non-null  object 
 4   Аннотация      4780 non-null   object 
 5   Дата           11488 non-null  object 
 6   Unnamed: 1     842 non-null    object 
 7   Unnamed: 2     842 non-null    float64
 8   Unnamed: 3     842 non-null    float64
 9   Unnamed: 4     842 non-null    object 
 10  Unnamed: 5     834 non-null    object 
dtypes: float64(4), object(7)
memory usage: 2.2+ MB


In [9]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time",
                   "Текст новости": "news_text", "Аннотация": "annotation"}, inplace=True)

df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["news_text"] = df["news_text"] + ". " + df["annotation"].fillna("")
df["label"] = np.where(df["absolute_price_difference"] > 0, 1, 0)
df = df[["news_text", "label"]].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6112 entries, 0 to 1009
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   news_text  6112 non-null   object
 1   label      6112 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 143.2+ KB


In [10]:
text_transformer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

In [11]:
train_data = df.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = df.drop(train_data.index)

In [12]:
X_train_text = text_transformer.fit_transform(train_data["news_text"])
X_test_text = text_transformer.transform(test_data["news_text"])

In [13]:
X_train_text.shape, X_test_text.shape

((4890, 70915), (43, 70915))

In [27]:
logit = LogisticRegression(C=5e1, solver="liblinear", random_state=17)

In [28]:
logit.fit(X_train_text, train_data["label"])

In [29]:
test_preds = logit.predict(X_test_text)

In [32]:
cv_results = cross_val_score(logit, X_train_text, train_data["label"], cv=5, scoring="recall_macro")

print(cv_results, cv_results.mean())

[0.58760672 0.58680147 0.57603835 0.57982615 0.59546078] 0.5851466956318155


In [31]:
from sklearn.metrics import accuracy_score, f1_score

print("Logistic Regression accuracy: " + str(accuracy_score(test_data["label"], test_preds)))
print("Logistic Regression f1 score: " + str(f1_score(test_data["label"], test_preds)))

Logistic Regression accuracy: 0.7674418604651163
Logistic Regression f1 score: 0.7727272727272727
