# PageRank

## Обработка данных

In [1]:
%pip install scipy
%pip install numpy
%pip install pandas
%pip install networkx
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from typing import Tuple, Mapping, Any

In [3]:
# Load the ratings table. Forward slashes work on every OS (Windows included),
# whereas the backslash separator only resolves on Windows.
df = pd.read_csv("res/rating.csv")

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the rating column.
df['rating'].describe()

count    2.000026e+07
mean     3.525529e+00
std      1.051989e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [5]:
def split_by_target_column_value(df: pd.DataFrame, test_size: float, target_column: str, target_value: Any,
                                 random_state: int = 42) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Hold out part of the rows that match one value of a column as a test set.

    Rows where ``df[target_column] == target_value`` are divided with
    ``train_test_split``; every other row of ``df`` goes into the training
    data unchanged.

    Parameters:
        df (pd.DataFrame): DataFrame to split.
        test_size (float): Share of the matching rows to put into the test set
            (forwarded to ``train_test_split``).
        target_column (str): Column whose value selects the rows to split.
        target_value (Any): Value of ``target_column`` identifying those rows.
        random_state (int): Seed forwarded to ``train_test_split``.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
            - data: the training part of the matching rows followed by all
              non-matching rows (in their original order), re-indexed from 0;
            - test_df: the held-out matching rows, which are not in ``data``;
            - train_df: the matching rows that were placed into ``data``.

    Raises:
        KeyError: if no row has ``target_value`` in ``target_column``.
    """
    is_target = df[target_column] == target_value
    if not is_target.any():
        # Fail the same way ``groupby(...).get_group`` does for an unknown value.
        raise KeyError(target_value)

    train_df, test_df = train_test_split(df[is_target], test_size=test_size,
                                         random_state=random_state)

    # A boolean mask keeps rows whose key is missing (a groupby would silently
    # drop them) and avoids building one intermediate frame per distinct value.
    data = pd.concat([train_df, df[~is_target]], ignore_index=True)
    return data, test_df, train_df



In [6]:
# Hold out half of user 1's ratings as test1; data contains the other half (train1)
# together with the ratings of all other users.
data, test1, train1 = split_by_target_column_value(df, test_size=0.5, target_column='userId', target_value=1)

## Реализация PageRank

In [7]:
def get_range(data: pd.DataFrame, personalization: Mapping,
               source: str, target: str, edge_attr: str) -> pd.DataFrame:
    """
    Compute personalized PageRank scores for the target-side nodes of an edge list.

    Each row of ``data`` becomes one undirected edge between a source node and
    a target node. Nodes are stored as ``(value, 'source')`` / ``(value, 'target')``
    tuples so that equal values in the two columns stay distinct nodes.

    Parameters:
        data (pd.DataFrame): Edge list; one row per edge.
        personalization (Mapping): Personalization weights keyed by values of
            the ``source`` column; they are applied to the source-side nodes.
        source (str): Name of the column holding the source end of each edge.
        target (str): Name of the column holding the target end of each edge.
        edge_attr (str): Name of the column holding the edge weight.

    Returns:
        pd.DataFrame: One row per target node with columns ``[target, 'pagerank']``.
    """
    # Iterate the three columns in lockstep instead of DataFrame.iterrows():
    # iterrows builds a Series per row (very slow on large frames) and may
    # upcast integer ids to float. The generator also avoids holding the whole
    # edge list in memory before it is added to the graph.
    weighted_edges = (
        ((src, 'source'), (dst, 'target'), {'weight': weight})
        for src, dst, weight in zip(data[source], data[target], data[edge_attr])
    )

    graph = nx.Graph()
    graph.add_edges_from(weighted_edges)

    # Re-key the personalization so it addresses the tagged source nodes.
    source_personalization = {(key, 'source'): value for key, value in personalization.items()}

    ranks = nx.pagerank(graph, personalization=source_personalization)

    # Report only target-side nodes, stripping the side tag from the node id.
    target_ranks = ((node_id, rank) for (node_id, side), rank in ranks.items() if side == 'target')
    return pd.DataFrame(target_ranks, columns=[target, 'pagerank'])


In [8]:
# Build a small toy edge list to try get_range on
test_df = pd.DataFrame({
    'source': [1, 2, 2, 2],
    'target': [2, 2, 3, 1],
    'edge_attr': [5, 5, 5, 1]
})

# Toy personalization: all weight on source id 1
personalization = {1: 1}

prediction_test = get_range(test_df, personalization=personalization, source='source', target='target',
                                            edge_attr='edge_attr')
prediction_test

Unnamed: 0,target,pagerank
0,2,0.346707
1,3,0.093959
2,1,0.018792


In [9]:
# Score every movie for user 1: PageRank over the userId-movieId edges in data,
# weighted by rating and personalized on user 1.
prediction_movie_for_user1 = get_range(data=data, personalization={1: 1}, source='userId', target='movieId',
                                            edge_attr='rating')


In [10]:
# Display the PageRank score computed for each movie.
prediction_movie_for_user1

Unnamed: 0,movieId,pagerank
0,2288,1.772472e-03
1,2648,1.458741e-03
2,1222,1.858296e-03
3,3037,1.466749e-03
4,2683,1.792195e-03
...,...,...
26739,121017,1.565996e-08
26740,121019,2.013423e-08
26741,121021,2.013423e-08
26742,110167,2.216420e-08


In [11]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2288,4.0,2004-09-10 03:14:37
1,1,2648,3.5,2004-09-10 03:13:37
2,1,1222,3.5,2005-04-02 23:30:37
3,1,3037,3.5,2005-04-02 23:46:03
4,1,2683,3.5,2004-09-10 03:07:30


In [36]:
# Drop movies that already appear in user 1's training ratings (train1), so only
# movies not yet seen in training for this user remain as candidates.
prediction_movie_for_user1_without_famous = prediction_movie_for_user1[~(prediction_movie_for_user1['movieId'].isin(train1['movieId']))]
prediction_movie_for_user1_without_famous

Unnamed: 0,movieId,pagerank
87,3,1.789835e-04
88,62,3.185775e-04
89,70,2.256903e-04
90,110,1.069741e-03
91,242,1.748893e-05
...,...,...
26739,121017,1.565996e-08
26740,121019,2.013423e-08
26741,121021,2.013423e-08
26742,110167,2.216420e-08


In [37]:
# Keep the highest-ranked candidates, as many as there are held-out test ratings,
# so the predicted set and the test set have the same size.
pred = prediction_movie_for_user1_without_famous.sort_values(by='pagerank', ascending=False).head(len(test1))


In [38]:
def accuracy_without_order(true_labels, predicted_labels):
    """
    Return the Jaccard similarity of two label collections.

    Order and duplicates are ignored: both inputs are reduced to sets and the
    score is ``|intersection| / |union|``.

    Parameters:
        true_labels: Iterable of hashable ground-truth labels.
        predicted_labels: Iterable of hashable predicted labels.

    Returns:
        float: A value in ``[0, 1]``; ``1.0`` when both sets are equal
        (including when both are empty), ``0.0`` when they share no label.
    """
    true_set = set(true_labels)
    pred_set = set(predicted_labels)

    union_size = len(true_set | pred_set)
    if union_size == 0:
        # Two empty collections are identical; avoid dividing by zero.
        return 1.0

    return len(true_set & pred_set) / union_size

In [39]:
# Display the selected top-ranked candidate movies.
pred

Unnamed: 0,movieId,pagerank
592,296,0.001383
153,318,0.001371
305,356,0.001286
92,260,0.001196
263,2571,0.001149
...,...,...
814,7361,0.000479
382,357,0.000475
341,587,0.000474
301,10,0.000472


In [40]:
# Movie ids of user 1's held-out ratings.
test1['movieId']

156    7045
145    6242
101    3081
127    4911
141    5898
       ... 
125    4878
25     1089
23     1079
142    5952
81     2542
Name: movieId, Length: 88, dtype: int64

In [41]:
# Movie ids of the predicted candidates.
pred['movieId']

592     296
153     318
305     356
92      260
263    2571
       ... 
814    7361
382     357
341     587
301      10
573    4963
Name: movieId, Length: 88, dtype: int64

In [42]:
# Overlap (intersection over union) between held-out and predicted movie ids.
print(accuracy_without_order(test1['movieId'], pred['movieId']))

0.17333333333333334


In [43]:
# Load the movie metadata table; it is joined to the results on movieId.
movie_df = pd.read_csv('res/movie.csv')

In [44]:
# Attach movie metadata to the predictions and show the five highest-ranked movies.
prediction_moviename = pd.merge(movie_df, pred, on='movieId', how='inner')
prediction_moviename.sort_values(by='pagerank', ascending=False).head()

Unnamed: 0,movieId,title,genres,pagerank
11,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.001383
13,318,"Shawshank Redemption, The (1994)",Crime|Drama,0.001371
16,356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.001286
9,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.001196
71,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,0.001149


In [45]:
# Attach movie metadata to the held-out ratings and show the five highest-rated movies.
test_moviename = pd.merge(movie_df, test1, on='movieId', how='inner')
test_moviename.sort_values(by='rating', ascending=False).head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
73,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,1,5.0,2005-04-02 23:30:19
87,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,1,4.5,2005-04-02 23:44:53
21,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,1,4.5,2005-04-02 23:30:24
20,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,1,4.5,2005-04-02 23:32:22
54,3499,Misery (1990),Drama|Horror|Thriller,1,4.0,2005-04-02 23:35:18
