In [76]:
import json
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from joblib import dump

### Przygotowanie danych

In [118]:
# Build the training samples. For a window of five consecutive weekly files,
# the newest file supplies the target ("play_count") and the four older files
# supply the features ("play_count_week_1" .. "play_count_week_4").
directory = "../../data/basic_training_data_by_weeks"
data = []
weeks = len(os.listdir(directory))

# Read through a context manager so the file handle is closed deterministically.
with open("../../data/native_data/tracks/tracks.json", 'r') as file_handler:
    tracks = json.load(file_handler)

for week in range(weeks):
    file_paths = [directory + f"/basic_training_data_{week + i}.json" for i in range(1, 6)]

    # The last path of the window is the target week; stop as soon as it is missing.
    if not os.path.exists(file_paths[-1]):
        print(f"Could not open file: {file_paths[-1]}")
        break

    # NOTE(review): this rebinds `tracks`, discarding the native track list
    # loaded above -- confirm that list is not needed later.
    with open(file_paths.pop(), 'r') as file_handler:
        tracks = json.load(file_handler)

    # Parse every history file once per window (instead of once per track) and
    # index it by track_id so each lookup below is a constant-time dict access.
    weekly_play_counts = []
    for file_path in file_paths:
        play_counts = {}
        if os.path.exists(file_path):
            with open(file_path, 'r') as file_handler:
                for track_file in json.load(file_handler):
                    # setdefault keeps the first entry seen for a track_id,
                    # i.e. the entry a front-to-back scan would have returned.
                    play_counts.setdefault(track_file['track_id'], track_file.get('play_count'))
        # A missing file yields an empty lookup, so every track falls back to 0.
        weekly_play_counts.append(play_counts)

    for track in tracks:
        x_sample = {}
        x_sample["track_id"] = track.get('track_id')
        x_sample["play_count"] = track.get("play_count")
        for i, play_counts in enumerate(weekly_play_counts):
            # Tracks absent from a history week count as zero plays.
            x_sample[f"play_count_week_{i+1}"] = play_counts.get(x_sample["track_id"], 0)

        data.append(x_sample)

    # NOTE(review): this unconditional break means only the first window is
    # ever processed. It is kept because the recorded outputs depend on it;
    # remove it to build samples from every available window.
    break
    

In [121]:
# Persist the assembled samples as a single JSON document and report how many
# samples were written.
with open("./../../data/training_data/base_model.json", 'w') as out_file:
    serialized_samples = json.dumps(data)
    out_file.write(serialized_samples)
    print(len(data))

2927


In [123]:
# Reload the persisted samples and split them into features and target.
with open("./../../data/training_data/base_model.json", "r") as in_file:
    data = json.load(in_file)

df = pd.DataFrame(data)
# Features: every column except the target and the track identifier.
X = df.drop(columns=['play_count', "track_id"])
# Target: the play count column.
Y = df['play_count']

In [129]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=101,
)

In [130]:
# Quick look at the training split. `.size` is the total number of elements,
# so for the feature frame it is rows x columns rather than the row count.
size_report = f"X_train size: {X_train.size} \n Y_train size {Y_train.size}"
print(size_report)
for training_part in (X_train, Y_train):
    print(training_part)

X_train size: 9364 
 Y_train size 2341
      play_count_week_1  play_count_week_2  play_count_week_3  \
970                   0                  1                  3   
1347                  1                  2                  0   
1418                  2                  2                  1   
1567                  1                  1                  1   
2649                  1                  2                  1   
...                 ...                ...                ...   
599                   3                  4                  0   
1599                  0                  1                  2   
1361                  1                  3                  3   
1547                  1                  2                  1   
863                   2                  0                  1   

      play_count_week_4  
970                   0  
1347                  0  
1418                  2  
1567                  1  
2649                  1  
...                 ... 

### Model bazowy

In [131]:
# Baseline model: a plain linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

In [132]:
# Serialize the fitted model into the notebook's working directory.
dump(model, "./base_model.joblib")

['./base_model.joblib']