In [4]:
!pip install -U scikit-learn pandas numpy pillow pyparsing openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
     -------------------------------------- 250.0/250.0 KB 1.1 MB/s eta 0:00:00
Collecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2


You should consider upgrading via the 'd:\python3.9\python.exe -m pip install --upgrade pip' command.


In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

### Data preparation

In [6]:
df = pd.DataFrame()
for root, _, files in os.walk("data"):
    for filename in files:
        temp_df = pd.read_excel(os.path.join(root, filename), "Sheet1")
        temp_df.drop(columns=[temp_df.columns[0], "Разница в долларах", "Дельта в процентах"], axis=1, inplace=True, errors="ignore")
        df = pd.concat([df, temp_df], axis=0, sort=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24484 entries, 0 to 1497
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Дата           11488 non-null  object 
 1   Время          16445 non-null  object 
 2   Цена до        16445 non-null  float64
 3   Цена после     16445 non-null  float64
 4   Текст новости  16445 non-null  object 
 5   Unnamed: 1     842 non-null    object 
 6   Unnamed: 2     842 non-null    float64
 7   Unnamed: 3     842 non-null    float64
 8   Unnamed: 4     842 non-null    object 
 9   Unnamed: 5     834 non-null    object 
 10  Аннотация      4780 non-null   object 
dtypes: float64(4), object(7)
memory usage: 2.2+ MB


In [7]:
df = df.drop_duplicates(subset=["Текст новости"], keep=False)
df.rename(columns={"Цена до": "price_before", "Цена после": "price_after", "Дата": "date", "Время": "Time",
                   "Текст новости": "news_text", "Аннотация": "annotation"}, inplace=True)

df["absolute_price_difference"] = df["price_after"] - df["price_before"]
df["news_text"] = df["news_text"] + ". " + df["annotation"].fillna("")
df["label"] = np.where(df["absolute_price_difference"] > 0, 1, 0)
df = df[["news_text", "label"]].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6112 entries, 1 to 1009
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   news_text  6112 non-null   object
 1   label      6112 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 119.4+ KB


In [8]:
text_transformer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

In [9]:
train_data = df.sample(frac=0.8, random_state=42)

# Testing dataset
test_data = df.drop(train_data.index)

In [10]:
X_train_text = text_transformer.fit_transform(train_data["news_text"])
X_test_text = text_transformer.transform(test_data["news_text"])

In [11]:
X_train_text.shape, X_test_text.shape

((4890, 70088), (52, 70088))

In [12]:
logit = LogisticRegression(C=5e1, solver="liblinear", random_state=17)

In [13]:
logit.fit(X_train_text, train_data["label"])

In [14]:
test_preds = logit.predict(X_test_text)

In [15]:
cv_results = cross_val_score(logit, X_train_text, train_data["label"], cv=5, scoring="recall_macro")

print(cv_results, cv_results.mean())

[0.58184689 0.60731896 0.59291943 0.56446352 0.60830071] 0.5909699007970611


In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

# Calculate evaluation metrics
accuracy = accuracy_score(test_data["label"], test_preds)
precision = precision_score(test_data["label"], test_preds)
recall = recall_score(test_data["label"], test_preds)
f1 = f1_score(test_data["label"], test_preds)
auc_roc = roc_auc_score(test_data["label"], test_preds)
conf_matrix = confusion_matrix(test_data["label"], test_preds)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("AUC-ROC:", auc_roc)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.6923076923076923
Precision: 0.6363636363636364
Recall: 0.84
F1-Score: 0.7241379310344828
AUC-ROC: 0.6977777777777777
Confusion Matrix:
[[15 12]
 [ 4 21]]
