In [12]:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine(
        "postgresql://robot-startml-ro:pheiph0hahj1Vaif@"
        "postgres.lab.karpov.courses:6432/startml"
    )

# Чтение данных

In [13]:
# Чтение данных таблицы user_data
query = "SELECT * FROM user_data"
user_data = pd.read_sql(query, engine)
print(f"User data shape: {user_data.shape}")

# Чтение данных таблицы post_text_df
query = "SELECT * FROM post_text_df"
post_text_df = pd.read_sql(query, engine)
print(f"Post text data shape: {post_text_df.shape}")

User data shape: (163205, 8)
Post text data shape: (7023, 3)


In [14]:
import numpy as np
import re
from string import punctuation

def word_count(X):
    return np.array([len(re.findall(r'\b\w+\b', text)) for text in X])

def sentence_count(X):
    return np.array([len(re.findall(r'[.!?]+', text)) for text in X])

def avg_word_length(X):
    return np.array([sum(len(word) for word in re.findall(r'\b\w+\b', text)) / len(re.findall(r'\b\w+\b', text)) if len(re.findall(r'\b\w+\b', text)) > 0 else 0 for text in X])

def punctuation_count(X):
    return np.array([sum(1 for char in text if char in punctuation) for text in X])

# Apply the feature extraction functions to the 'text' column
word_counts = word_count(post_text_df['text'])
sentence_counts = sentence_count(post_text_df['text'])
avg_word_lengths = avg_word_length(post_text_df['text'])
punctuation_counts = punctuation_count(post_text_df['text'])

# Add the new features as columns in the user_dataFrame
post_text_df['word_count'] = word_counts
post_text_df['sentence_count'] = sentence_counts
post_text_df['avg_word_length'] = avg_word_lengths
post_text_df['punctuation_count'] = punctuation_counts

from sklearn.preprocessing import LabelEncoder

le_gender = LabelEncoder()
le_os = LabelEncoder()
le_source = LabelEncoder()
le_action = LabelEncoder()

# Label encoding for 'gender', 'os', and 'source'
user_data['gender'] = le_gender.fit_transform(user_data['gender'])
user_data['os'] = le_os.fit_transform(user_data['os'])
user_data['source'] = le_source.fit_transform(user_data['source'])
post_text_df['topic'] = le_action.fit_transform(post_text_df['topic'])
user_data['country'] = le_action.fit_transform(user_data['country'])
user_data['city'] = le_action.fit_transform(user_data['city'])

# Additional text features

In [15]:
# Чтение feed_data из csv файла
feed_data = pd.read_csv(r'C:\Users\Alex\Desktop\feed_data.csv')
print(f"Feed data shape: {feed_data.shape}")

Feed data shape: (76892800, 5)


In [16]:
feed_data.head()

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-26 16:40:07,52431,6920,view,0
1,2021-12-26 16:42:42,52431,3840,view,0
2,2021-12-26 16:45:27,52431,1716,view,0
3,2021-12-26 16:46:39,52431,1054,view,0
4,2021-12-26 16:48:54,52431,963,view,0


## Объединение файлов

In [17]:
# Переименование столбцов идентификаторов
user_data = user_data.rename(columns={'id': 'user_id'})
post_text_df = post_text_df.rename(columns={'id': 'post_id'})

# Объединение таблиц
data = feed_data.merge(user_data, on='user_id', how='left')
large_data = data.merge(post_text_df, on='post_id', how='left')

print(f"Data shape after load_and_merge_data: {data.shape}")
print(data.columns)

Data shape after load_and_merge_data: (76892800, 12)
Index(['timestamp', 'user_id', 'post_id', 'action', 'target', 'gender', 'age',
       'country', 'city', 'exp_group', 'os', 'source'],
      dtype='object')


### Обрезание датафрейма

In [18]:
def sample_10_percent(group):
    frac = 0.3
    return group.sample(frac=frac)

data = large_data.groupby('user_id', group_keys=False).apply(sample_10_percent)
print(f"Sampled data shape: {data.shape}")

Sampled data shape: (30757158, 18)


In [19]:
# delete the large_data
large_data = None

# Обработка данных

## User-based

### Обработка временных меток

In [20]:
# Преобразование формата временных меток в объект datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Извлечение признаков из временных меток
data['month'] = data['timestamp'].dt.month
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['hour_of_day'] = data['timestamp'].dt.hour

# Расчет времени с момента последнего действия для каждого пользователя
data = data.sort_values(['user_id', 'timestamp'])
data['time_since_last_action'] = data.groupby('user_id')['timestamp'].diff().dt.total_seconds()
data['time_since_last_action'].fillna(0, inplace=True)

# Extracting day of the month and year from the timestamp
data['day_of_month'] = data['timestamp'].dt.day
data['year'] = data['timestamp'].dt.year

# Удаление столбца временных меток
data = data.drop('timestamp', axis=1)

print('Timestamps processed')
print(f"Data shape after timestamps processing: {data.shape}")

Timestamps processed
Data shape after timestamps processing: (30757158, 23)


### Создание дополнительных признаков

In [21]:
# Feature 1: Количество просмотров и лайков для каждого пользователя
user_views_likes = data.groupby('user_id')['action'].value_counts().unstack().fillna(0)
user_views_likes.columns = ['user_views', 'user_likes']
data = data.merge(user_views_likes, on='user_id', how='left')

# Преобразование категориальных признаков в строковый формат
categorical_columns = ['country', 'city', 'topic', 'gender', 'os', 'source']
data[categorical_columns] = data[categorical_columns].astype(str)

print('Additional features created')
print(f"Data shape after additional features creation: {data.shape}")

Additional features created
Data shape after additional features creation: (30757158, 25)


### Наименование user-based признаков

In [22]:
print(data.columns)

Index(['user_id', 'post_id', 'action', 'target', 'gender', 'age', 'country',
       'city', 'exp_group', 'os', 'source', 'text', 'topic', 'word_count',
       'sentence_count', 'avg_word_length', 'punctuation_count', 'month',
       'day_of_week', 'hour_of_day', 'time_since_last_action', 'day_of_month',
       'year', 'user_views', 'user_likes'],
      dtype='object')


### Группировка юзеров с помощью KMeans

## Content-based

### Созранение дополнительных признаоков

In [23]:
# Feature 2: Количество просмотров и лайков для каждого поста
post_views_likes = data.groupby('post_id')['action'].value_counts().unstack().fillna(0)
post_views_likes.columns = ['post_views', 'post_likes']
data = data.merge(post_views_likes, on='post_id', how='left')

# Feature 3: Количество просмотров и лайков для каждой группы тематик
temp_df = data[['exp_group', 'topic', 'action']]

# Создание колонок с количеством просмотров и лайков для каждой темы внутри группы
topic_action_count = temp_df.pivot_table(index='exp_group', columns=['topic', 'action'], aggfunc=len, fill_value=0)
topic_action_count.columns = [f'{col[0]}_exp_group_{col[1]}s' for col in topic_action_count.columns]
grouped_data = topic_action_count.reset_index()

data = data.merge(grouped_data, on='exp_group', how='left')

print('Additional features created')
print(f"Data shape after additional features creation: {data.shape}")

Additional features created
Data shape after additional features creation: (30757158, 41)


### Truncated Singular Value Decomposition (Truncated SVD), also known as Latent Semantic Analysis (LSA).

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Take only 'text' and 'target' columns
df = data[['text', 'target']]

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'])

# Initialize SVD
n_components = 10
svd = TruncatedSVD(n_components=n_components, random_state=42)
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

reduced_df = pd.DataFrame(reduced_tfidf_matrix, columns=[f'component_{i}' for i in range(1, n_components+1)])
reduced_df['target'] = df['target']


MemoryError: 

In [None]:
reduced_df.shape

(15378483, 11)

In [None]:
reduced_df.head()

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,target
0,-0.21958,-0.017638,-0.020689,0.011861,-0.008848,0.04956,0.039237,-0.013185,0.003749,0.035966,0
1,-0.064687,-0.047597,-0.052248,0.101186,-0.069884,-0.042457,0.015479,-0.021705,-0.011001,-0.03019,0
2,-0.405443,-0.234049,-0.155049,-0.017235,0.087969,-0.060522,-0.073545,0.031013,0.042862,0.043258,0
3,-0.137285,-0.033293,-0.069082,0.106621,-0.06879,0.038433,-0.007784,-0.008507,-0.033078,-0.036161,0
4,-0.425986,-0.048044,0.023365,-0.045733,-0.138464,0.005685,0.049314,-0.041455,-0.039026,-0.057057,0


In [None]:
reduced_df['post_id'] = data['post_id']

In [None]:
# Merge reduced_df with data
reduced_df.drop('target', axis=1, inplace=True)
data = data.merge(reduced_df, left_index=True, right_index=True, how='left')
data.head()

Unnamed: 0,user_id,post_id_x,action,target,gender,age,country,city,exp_group,os,...,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,post_id_y
0,200,5057,view,0,1,34,7,651,3,0,...,-0.017638,-0.020689,0.011861,-0.008848,0.04956,0.039237,-0.013185,0.003749,0.035966,5057
1,200,2489,view,0,1,34,7,651,3,0,...,-0.047597,-0.052248,0.101186,-0.069884,-0.042457,0.015479,-0.021705,-0.011001,-0.03019,2489
2,200,1244,view,0,1,34,7,651,3,0,...,-0.234049,-0.155049,-0.017235,0.087969,-0.060522,-0.073545,0.031013,0.042862,0.043258,1244
3,200,3263,view,0,1,34,7,651,3,0,...,-0.033293,-0.069082,0.106621,-0.06879,0.038433,-0.007784,-0.008507,-0.033078,-0.036161,3263
4,200,668,like,0,1,34,7,651,3,0,...,-0.048044,0.023365,-0.045733,-0.138464,0.005685,0.049314,-0.041455,-0.039026,-0.057057,668


In [None]:
# print data columns
print(data.columns)

Index(['user_id', 'post_id_x', 'action', 'target', 'gender', 'age', 'country',
       'city', 'exp_group', 'os', 'source', 'text', 'topic', 'word_count',
       'sentence_count', 'avg_word_length', 'punctuation_count', 'month',
       'day_of_week', 'hour_of_day', 'time_since_last_action', 'day_of_month',
       'year', 'user_views', 'user_likes', 'post_views', 'post_likes',
       '0_exp_group_likes', '0_exp_group_views', '1_exp_group_likes',
       '1_exp_group_views', '2_exp_group_likes', '2_exp_group_views',
       '3_exp_group_likes', '3_exp_group_views', '4_exp_group_likes',
       '4_exp_group_views', '5_exp_group_likes', '5_exp_group_views',
       '6_exp_group_likes', '6_exp_group_views', 'component_1', 'component_2',
       'component_3', 'component_4', 'component_5', 'component_6',
       'component_7', 'component_8', 'component_9', 'component_10',
       'post_id_y'],
      dtype='object')


In [None]:
# dropping the text column
data.drop('text', axis=1, inplace=True)

In [None]:
# saving the data_for_training to csv in desctop
import os

desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")
file_path = os.path.join(desktop_path, "04_data_for_training.csv")

data.to_csv(file_path, index=False)