In [10]:
import numpy as np
import pandas as pd

# Convert MovieLens data to binary using numpy_to_binary function
def movie_lens_to_binary(input_file, output_file):
    # Load MovieLens data using Pandas
    ratings = pd.read_csv(input_file, sep='\t', header=None,
                          names=['userId', 'movieId', 'rating', 'rating_timestamp'])
    # Convert to NumPy array
    np_data = np.array(ratings[['userId', 'movieId', 'rating']])
    # Write to binary file
    with open(output_file, "wb") as bin_file:
        bin_file.write(np_data.astype(np.int32).tobytes())
movie_lens_to_binary('u.data', 'output_binary.bin')

In [11]:

def binary_to_pandas(bin_file, num_rows=10):
    # Read binary data into NumPy array
    with open(bin_file, 'rb') as f:
        binary_data = f.read()

    # Convert binary data back to NumPy array
    np_data = np.frombuffer(binary_data, dtype=np.int32).reshape(-1, 3)  # Assuming 3 columns

    # Convert NumPy array to Pandas DataFrame
    df = pd.DataFrame(np_data, columns=['userId', 'movieId', 'rating'])

    # Display the equivalent of ratings.head(10)
    print(df.head(num_rows))

# Usage
binary_to_pandas('output_binary.bin', num_rows=10)


   userId  movieId  rating
0     196      242       3
1     186      302       3
2      22      377       1
3     244       51       2
4     166      346       1
5     298      474       4
6     115      265       2
7     253      465       5
8     305      451       3
9       6       86       3


In [12]:
import numpy as np
import pandas as pd

def binary_to_pandas_with_stats(bin_file, num_rows=10):
    # Read binary data into NumPy array
    with open(bin_file, 'rb') as f:
        binary_data = f.read()
    # Convert binary data back to NumPy array
    np_data = np.frombuffer(binary_data, dtype=np.int32).reshape(-1, 3)  # Assuming 3 columns
    # Convert NumPy array to Pandas DataFrame
    df = pd.DataFrame(np_data, columns=['userId', 'movieId', 'rating'])
    # Calculate max and min values for 'userId'
    userId_max = df['userId'].max()
    userId_min = df['userId'].min()
    num_rows_df = len(df.index)
    return userId_max, userId_min, num_rows_df
# Usage
userId_max, userId_min, num_rows_df = binary_to_pandas_with_stats('output_binary.bin', num_rows=10)

print(f"Maximum userId: {userId_max}")
print(f"Minimum userId: {userId_min}")
print(f"Number of rows: {num_rows_df}")

Maximum userId: 943
Minimum userId: 1
Number of rows: 100000


In [13]:
#16 seg
import numpy as np
import pandas as pd

def binary_to_pandas_with_stats(bin_file, num_rows=10):
    # Read binary data into NumPy array
    with open(bin_file, 'rb') as f:
        binary_data = f.read()
    # Convert binary data back to NumPy array
    np_data = np.frombuffer(binary_data, dtype=np.int32).reshape(-1, 3)  # Assuming 3 columns
    # Convert NumPy array to Pandas DataFrame
    df = pd.DataFrame(np_data, columns=['userId', 'movieId', 'rating'])
    return df
def consolidate_data(df):
    # Group by 'userId' and 'movieId' and calculate the mean of 'rating'
    consolidated_df = df.groupby(['userId', 'movieId'])['rating'].mean().unstack()
    return consolidated_df
df = binary_to_pandas_with_stats('output_binary.bin', num_rows=10)

# Consolidate data
consolidated_df = consolidate_data(df)
print("Consolidated data:")
print(consolidated_df)


Consolidated data:
movieId  1     2     3     4     5     6     7     8     9     10    ...  \
userId                                                               ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
...       ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   
939       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0   NaN  ...   
940       NaN   NaN   NaN   2.0   NaN   NaN   4.0   5.0   3.0   NaN  ...   
941       5.0   NaN   NaN   NaN   NaN   NaN   4.0   NaN   NaN   NaN  ...   
942       NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
943       NaN   5.0   NaN   NaN   NaN   NaN   NaN   NaN   3.0   NaN  

In [14]:
#it takes 32 seconds
#comparate
import pandas as pd
import numpy as np
from scipy.spatial.distance import cityblock

def computeNearestNeighbor(dataframe, target_user, distance_metric=cityblock):
    distances = np.zeros(len(dataframe))  # Inicializa un array NumPy
    # Itera sobre cada fila (usuario) en el DataFrame
    for i, (index, row) in enumerate(dataframe.iterrows()):
        if index == target_user:
            continue  # Omite el propio usuario objetivo
        # Calcula la distancia entre el usuario objetivo y el usuario actual
        distance = distance_metric(dataframe.loc[target_user].fillna(0), row.fillna(0))
        distances[i] = distance
    # Obtiene los índices que ordenarían el array y luego ordena las distancias y los índices de los usuarios en consecuencia
    sorted_indices = np.argsort(distances)
    sorted_distances = distances[sorted_indices]
    return list(zip(dataframe.index[sorted_indices], sorted_distances))
# Ejemplo de uso
# Suponiendo que tu DataFrame se llama 'consolidated_df'
target_user_id = 1
neighbors = computeNearestNeighbor(consolidated_df, target_user_id)
# Imprime los vecinos más cercanos y sus distancias
print("Nearest Neighbors for User {}: {}".format(target_user_id, neighbors))

Nearest Neighbors for User 1: [(1, 0.0), (738, 826.0), (215, 863.0), (521, 865.0), (77, 872.0), (508, 881.0), (44, 887.0), (933, 887.0), (715, 895.0), (352, 897.0), (538, 897.0), (778, 898.0), (746, 902.0), (248, 906.0), (737, 907.0), (96, 909.0), (806, 910.0), (638, 913.0), (823, 917.0), (748, 918.0), (844, 922.0), (177, 924.0), (773, 924.0), (468, 929.0), (226, 930.0), (97, 930.0), (297, 930.0), (175, 930.0), (124, 930.0), (913, 933.0), (247, 934.0), (745, 934.0), (73, 936.0), (41, 936.0), (421, 937.0), (584, 937.0), (700, 937.0), (371, 939.0), (411, 939.0), (781, 941.0), (53, 941.0), (868, 942.0), (723, 944.0), (8, 944.0), (757, 945.0), (307, 945.0), (272, 945.0), (12, 946.0), (69, 946.0), (198, 947.0), (412, 947.0), (514, 947.0), (23, 947.0), (64, 949.0), (402, 949.0), (148, 950.0), (632, 950.0), (867, 951.0), (103, 951.0), (686, 952.0), (680, 952.0), (51, 952.0), (55, 953.0), (742, 954.0), (182, 954.0), (929, 955.0), (493, 955.0), (71, 956.0), (679, 957.0), (566, 957.0), (442, 957

In [15]:
import numpy as np
import pandas as pd
import redis

# Configuración de Redis
redis_host = 'localhost'
redis_port = 6379
redis_key = 'movie_ratings_queue'

# Crear un cliente de Redis
redis_client = redis.Redis(host=redis_host, port=redis_port)

def send_data_to_redis(data, redis_key):
    # Convierte los datos a una cadena y envíalos a Redis
    data_str = ",".join(map(str, data.flatten()))
    redis_client.lpush(redis_key, data_str)

def receive_and_process_data_from_redis(redis_key, num_rows=10):
    # Espera mensajes en la cola de Redis
    while True:
        # Obtiene un mensaje de la cola
        data_str = redis_client.rpop(redis_key)
        if data_str is not None:
            # Convierte la cadena a un array NumPy
            data = np.fromstring(data_str.decode('utf-8'), dtype=np.int32, sep=',').reshape(-1, 3)
            df = pd.DataFrame(data, columns=['userId', 'movieId', 'rating'])
            print("Received data:")
            print(df.head(num_rows))

# Ejemplo de uso: Enviar datos a Redis
data_to_send = np.array([[1, 101, 5], [2, 102, 4], [3, 103, 3]])
send_data_to_redis(data_to_send, redis_key)

# Ejemplo de uso: Recibir y procesar datos desde Redis
receive_and_process_data_from_redis(redis_key)


In [None]:
import numpy as np
import pandas as pd
import redis

def receive_and_process_data_from_redis(redis_key, num_rows=10):
    # Configuración de Redis
    redis_host = 'localhost'
    redis_port = 6379

    # Crear un cliente de Redis
    redis_client = redis.Redis(host=redis_host, port=redis_port)

    # Espera mensajes en la cola de Redis
    while True:
        # Obtiene un mensaje de la cola
        data_str = redis_client.rpop(redis_key)
        if data_str is not None:
            # Convierte la cadena a un array NumPy
            data = np.fromstring(data_str.decode('utf-8'), dtype=np.int32, sep=',').reshape(-1, 3)
            df = pd.DataFrame(data, columns=['userId', 'movieId', 'rating'])
            print("Received data:")
            print(df.head(num_rows))

if __name__ == "__main__":
    # Clave de la cola en Redis
    redis_key = 'movie_ratings_queue'
    
    # Ejemplo de uso: Recibir y procesar datos desde Redis
    receive_and_process_data_from_redis(redis_key)
