In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import kagglehub
import os
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

Відкриваємо датасет

In [None]:
# Download (or reuse the cached copy of) the Kaggle dataset and list its files.
path = kagglehub.dataset_download("soumyodippal000/top-2000-companies-financial-data-2024-dataset")
files = os.listdir(path)
print("Files in dataset directory:", files)

# A later cell writes 'edited_data.csv' into this same directory, and
# os.listdir() returns entries in arbitrary order. Exclude the derived file and
# sort the rest so a re-run always loads the raw source CSV.
raw_csv_files = sorted(
    f for f in files if f.endswith(".csv") and f != "edited_data.csv"
)
if not raw_csv_files:
    raise FileNotFoundError(f"No source CSV file found in {path}")
csv_file = raw_csv_files[0]

df = pd.read_csv(os.path.join(path, csv_file))
# Keep only the four financial columns used by the rest of the notebook.
df = df.drop(columns=['Unnamed: 0', 'Name', 'Country'])

df.info()

/root/.cache/kagglehub/datasets/soumyodippal000/top-2000-companies-financial-data-2024-dataset/versions/1
Files in dataset directory: ['Top 2000 Companies Financial Data 2024.csv', 'edited_data.csv']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2001 entries, 0 to 2000
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Sales         2001 non-null   object
 1   Profit        2001 non-null   object
 2   Assets        2001 non-null   object
 3   Market Value  2001 non-null   object
dtypes: object(4)
memory usage: 62.7+ KB


Створимо функцію для очищення даних

In [None]:
def clean_value(value):
    """Strip currency formatting from one raw cell of the dataset.

    Removes every "$", "B", "M" and "," character and trims surrounding
    whitespace, e.g. "$1,234.5 B" -> "1234.5".

    Args:
        value: Raw cell content. Strings are cleaned; anything else
            (NaN, already-numeric values) is returned unchanged.

    Returns:
        The cleaned string for string input, otherwise ``value`` itself.

    Note:
        The unit letters are only removed, not applied: a value marked "M"
        and a value marked "B" come out on the same numeric scale. Callers
        that need a common unit must rescale before calling this function.
    """
    if not isinstance(value, str):
        # Missing or already-parsed cells have no .replace(); pass them through.
        return value
    for token in ("$", "B", "M", ","):
        value = value.replace(token, "")
    return value.strip()

In [None]:
FEATURE_COLUMNS = ['Sales', 'Profit', 'Assets']
TARGET_COLUMN = 'Market Value'


def _to_billions(value):
    """Parse one raw money cell into a float expressed in billions.

    Non-string input is assumed to be numeric already (for example when this
    cell is re-run on a cleaned frame) and is only converted to float.
    """
    if not isinstance(value, str):
        return float(value)
    # NOTE(review): assumes the unit is the trailing letter and that "M" means
    # millions while "B" means billions - confirm against the raw CSV. Without
    # this rescaling both kinds of value end up on the same numeric scale.
    is_millions = value.strip().endswith("M")
    number = float(clean_value(value))
    return number / 1000.0 if is_millions else number


# dropna() returns a new frame; the result has to be assigned to take effect.
df = df.dropna().reset_index(drop=True)

# Clean every column inside the frame itself, so that both the arrays below
# and the CSV written at the end of this cell contain plain floats.
for column in FEATURE_COLUMNS + [TARGET_COLUMN]:
    df[column] = df[column].map(_to_billions)

# Three financial features in, market value out.
x = df[FEATURE_COLUMNS].to_numpy(dtype=float)
y = df[TARGET_COLUMN].to_numpy(dtype=float)

pca = PCA(n_components=2)
x_2 = pca.fit_transform(x)


edited_file_path = os.path.join(path, 'edited_data.csv')
df.to_csv(edited_file_path, index=False)

print(df)

      Sales  Profit  Assets Market Value
0     252.9    50.0  4090.7        588.1
1     369.0    73.4  1070.0        899.1
2     489.1   116.9   661.5       1919.3
3     223.8    50.4  6586.0        215.2
4     183.3    25.0  3273.8        307.3
...     ...     ...     ...          ...
1996  201.0   632.7     8.6         11.3
1997    3.9   460.2    33.2            6
1998   15.2   158.0     9.3          2.6
1999    2.2   134.8    25.9           10
2000    5.1   812.8    11.6            9

[2001 rows x 4 columns]


Векторизація датасету

In [None]:
# Read the cleaned CSV back into float32 arrays: every column except the last
# is a feature, and the last column (Market Value) is the target.
all_features = []
all_targets = []
with open(edited_file_path, encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i == 0:
            print("HEADER:", line.strip())
            continue
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of failing in float().
            continue
        fields = [v.replace('"', "") for v in line.strip().split(",")]
        all_features.append([float(v) for v in fields[:-1]])
        # The target is the LAST column; index 0 is Sales, which is already a feature.
        all_targets.append([float(fields[-1])])
        if i == 1:
            print("EXAMPLE FEATURES:", all_features[-1])
features = np.array(all_features, dtype="float32")
targets = np.array(all_targets, dtype="float32")
print("features.shape:", features.shape)
print("targets.shape:", targets.shape)

HEADER: Sales,Profit,Assets,Market Value
EXAMPLE FEATURES: [252.9, 50.0, 4090.7]
features.shape: (2001, 3)
targets.shape: (2001, 1)


Розбиваємо дані на тренувальний і валідаційний набори

In [None]:
VAL_FRACTION = 0.2  # share of rows held out for validation
SPLIT_SEED = 42     # fixed seed so the split is reproducible across runs

num_val_samples = int(len(features) * VAL_FRACTION)

# Shuffle before splitting. NOTE(review): the rows look ordered by company
# rank (values shrink towards the end of the printed frame), so holding out
# the tail would validate only on the smallest companies - confirm.
# Passing an integer test_size keeps exactly num_val_samples validation rows.
train_features, val_features, train_targets, val_targets = train_test_split(
    features,
    targets,
    test_size=num_val_samples,
    shuffle=True,
    random_state=SPLIT_SEED,
)

print("Number of training samples:", len(train_features))
print("Number of validation samples:", len(val_features))

Number of training samples: 1601
Number of validation samples: 400


In [None]:
# np.bincount only accepts non-negative integers, and the targets are stored
# as float32, so cast explicitly. minlength=2 guarantees that counts[0] and
# counts[1] exist even when one of the two labels is absent.
# NOTE(review): these counts are only meaningful for 0/1 labels. If the target
# column is continuous, this merely buckets values truncated to 0 and 1 -
# confirm whether class weights are wanted at all.
labels = train_targets[:, 0].astype("int64")
counts = np.bincount(labels, minlength=2)
print(
    "Number of positive samples in training data: {} ({:.2f}% of total)".format(
        counts[1], 100 * float(counts[1]) / len(train_targets)
    )
)

# Inverse-frequency weights; a label with no samples gets weight 0.0 instead
# of a division by zero (inf plus a RuntimeWarning).
weight_for_0 = 1.0 / counts[0] if counts[0] else 0.0
weight_for_1 = 1.0 / counts[1] if counts[1] else 0.0

Number of positive samples in training data: 16 (1.00% of total)


  weight_for_0 = 1.0 / counts[0]


In [None]:
# Standardise both splits with statistics taken from the training split only.
# Every operation writes into the existing arrays (out=...), so no copies are made.
mean = train_features.mean(axis=0)
np.subtract(train_features, mean, out=train_features)
np.subtract(val_features, mean, out=val_features)

# The spread is measured on the already-centred training features.
std = train_features.std(axis=0)
np.divide(train_features, std, out=train_features)
np.divide(val_features, std, out=val_features)

In [None]:
import keras

model = keras.Sequential(
    [
        keras.Input(shape=train_features.shape[1:]),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model.summary()

In [None]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss="binary_crossentropy", metrics=["accuracy"]
)

#callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.h5")]
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    train_features,
    train_targets,
    batch_size=2048,
    epochs=30,
    verbose=2,
    #callbacks=callbacks,
    validation_data=(val_features, val_targets),
    class_weight=class_weight,
)

Epoch 1/30
1/1 - 2s - 2s/step - accuracy: 0.0044 - loss: 1.1686 - val_accuracy: 0.1250 - val_loss: 0.8404
Epoch 2/30
1/1 - 0s - 119ms/step - accuracy: 0.0062 - loss: 0.3727 - val_accuracy: 0.1375 - val_loss: -2.5241e-01
Epoch 3/30
1/1 - 0s - 102ms/step - accuracy: 0.0075 - loss: -2.6955e-01 - val_accuracy: 0.1375 - val_loss: -1.3440e+00
Epoch 4/30
1/1 - 0s - 135ms/step - accuracy: 0.0081 - loss: -1.3882e+00 - val_accuracy: 0.1375 - val_loss: -2.4319e+00
Epoch 5/30
1/1 - 0s - 132ms/step - accuracy: 0.0087 - loss: -1.5683e+00 - val_accuracy: 0.1375 - val_loss: -3.5028e+00
Epoch 6/30
1/1 - 0s - 89ms/step - accuracy: 0.0100 - loss: -2.9462e+00 - val_accuracy: 0.1375 - val_loss: -4.5771e+00
Epoch 7/30
1/1 - 0s - 149ms/step - accuracy: 0.0094 - loss: -3.9535e+00 - val_accuracy: 0.1375 - val_loss: -5.6712e+00
Epoch 8/30
1/1 - 0s - 136ms/step - accuracy: 0.0081 - loss: -4.4354e+00 - val_accuracy: 0.1375 - val_loss: -6.7831e+00
Epoch 9/30
1/1 - 0s - 133ms/step - accuracy: 0.0100 - loss: -4.7364

<keras.src.callbacks.history.History at 0x7afb2c2b3d00>

In [None]:
# Report performance on the held-out validation split; evaluating on the
# training data would only measure how well the model memorised it.
# return_dict=True labels every value with the metric name used at compile
# time, so the printout stays correct whichever metrics the model tracks.
score = model.evaluate(val_features, val_targets, verbose=0, return_dict=True)
for metric_name, metric_value in score.items():
    print(f"Validation {metric_name}: {metric_value:.4f}")

Test accuracy: 0.00999375432729721
