## Introduction
Greetings from the Kaggle bot! This is an automatically-generated kernel with starter code demonstrating how to read in the data and begin exploring. Click the blue "Edit Notebook" or "Fork Notebook" button at the top of this kernel to begin editing.

## Exploratory Analysis
To begin this exploratory analysis, first import the required libraries (such as `matplotlib`) and define functions for plotting the data. Depending on the data, not all plots will be made. (Hey, I'm just a kerneling bot, not a Kaggle Competitions Grandmaster!)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_curve, auc, precision_recall_curve, precision_score, recall_score

# Load datasets
checkins = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1')
tips = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', delimiter=',', encoding='ISO-8859-1')
tags = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', delimiter=',', encoding='ISO-8859-1')

# Merge datasets (outer joins keep rows that have no match in the other table)
merged_data = pd.merge(checkins, tips, on=['user_ID', 'venue_ID'], how='outer')
merged_data = pd.merge(merged_data, tags, on='venue_ID', how='outer')

# Create feature vectors: one row per venue holding the number of merged rows
# NOTE(review): after the outer merges a venue can appear once per matching
# tip/tag row, so `checkin_count` counts merged rows - confirm this is the
# intended feature.
features = merged_data.groupby('venue_ID').size().reset_index(name='checkin_count')

# Split dataset into training and testing sets
train, test = train_test_split(features, test_size=0.2, random_state=42)

# Fit k-NN model on the single count feature
knn = NearestNeighbors(n_neighbors=5)  # You can adjust the number of neighbors
knn.fit(train[['checkin_count']])

# Find nearest neighbors for test instances
distances, indices = knn.kneighbors(test[['checkin_count']])

# Evaluate precision and recall using a precision-recall curve
# Assuming a binary classification problem (e.g., recommending or not recommending a venue)
# You need to define your own threshold for recommendation based on the problem requirements

# Example threshold, adjust according to your problem.
# It is used both as the label cut-off on checkin_count and as the distance cut-off.
threshold = 30

# Convert distances to binary predictions
predictions = (distances < threshold).astype(int)

# Ground truth and per-venue prediction flag, computed once and reused below
y_true = test['checkin_count'] >= threshold
y_flagged = predictions.sum(axis=1) > 0

# Calculate precision-recall curve
# NOTE(review): the score passed here is already binary, so `thresholds` holds
# score cut-offs rather than distances - confirm before reusing it on `distances`.
precision, recall, thresholds = precision_recall_curve(y_true, y_flagged)

# Precision is bounded by [0, 1]; a target above 1 can never be met and would
# make the search below always fall through to its default.
target_precision = 0.3  # Adjust as needed

# Find the first index whose precision reaches the target (last index if none does)
best_threshold_index = next((i for i, p in enumerate(precision) if p >= target_precision), len(precision) - 1)

# `precision` has one more entry than `thresholds`, so clamp to the last threshold
best_threshold = thresholds[best_threshold_index] if best_threshold_index < len(thresholds) else thresholds[-1]

# Recalculate precision and recall with the best threshold
updated_predictions = (distances < best_threshold).astype(int)
updated_flagged = updated_predictions.sum(axis=1) > 0
updated_precision = precision_score(y_true, updated_flagged)
updated_recall = recall_score(y_true, updated_flagged)

print(f'Updated Precision: {updated_precision:.4f}')
print(f'Updated Recall: {updated_recall:.4f}')
print(f'Used Threshold: {best_threshold:.4f}')


There are 3 csv files in the current version of the dataset:


In [None]:
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))


The next hidden code cells define functions for plotting data. Click on the "Code" button in the published kernel to reveal the hidden code.

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are kept.
    Columns whose first value is numeric are drawn as histograms, all others
    as bar charts of their value counts.

    Parameters:
        df: DataFrame to plot.
        nGraphShown: maximum number of columns to plot.
        nGraphPerRow: number of subplots per grid row.

    Returns:
        None. Prints a message and returns early when nothing can be plotted.
    """
    nunique = df.nunique()
    # For displaying purposes, pick columns that have between 1 and 50 unique values
    df = df[[col for col in df if 1 < nunique[col] < 50]]
    nRow, nCol = df.shape
    nGraphs = min(nCol, nGraphShown)
    if nGraphs <= 0:
        # Avoid creating a zero-height figure when no column qualifies
        print('No distribution plots shown: no column has more than 1 and fewer than 50 unique values')
        return
    columnNames = list(df)
    # Integer ceiling division: subplot grid dimensions must be integers.
    # The grid is sized from the plots actually drawn, not from all columns.
    nGraphRow = (nGraphs + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(nGraphs):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The plot type is decided from the type of the column's first value
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw the correlation matrix of the numeric columns of ``df``.

    Columns containing NaN, non-numeric columns and constant columns are
    dropped first. If fewer than two columns remain, a message is printed and
    nothing is plotted.

    Parameters:
        df: DataFrame to analyse. Its optional ``dataframeName`` attribute is
            used in the plot title.
        graphWidth: width and height of the square figure, in inches.

    Returns:
        None.
    """
    # Fall back to a generic name when the caller did not tag the frame
    filename = getattr(df, 'dataframeName', 'dataframe')
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be passed by keyword in pandas 2)
    df = df.select_dtypes(include=[np.number]) # correlation is only defined for numeric columns
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    Columns containing NaN and constant columns are dropped, and at most the
    first 10 remaining columns are plotted. Each upper-triangle panel is
    annotated with the correlation coefficient of its column pair.

    Parameters:
        df: DataFrame to plot.
        plotSize: width and height of the square figure, in inches.
        textSize: font size of the correlation annotations.

    Returns:
        None. Prints a message and returns early when no column qualifies.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword in pandas 2
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    # reduce the number of columns for matrix inversion of kernel density plots
    columnNames = list(df)[:10]
    if not columnNames:
        print('No scatter plots shown: there are no non-constant numeric columns without NaN')
        return
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly instead of the incidental `plt.np` alias
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


Now you're ready to read in the data and use the plotting functions to visualize the data.

### Let's check 1st file: /kaggle/input/NY_Restauraunts_checkins.csv

In [None]:
nRowsRead = 1000 # set to None to read the whole file
# NY_Restauraunts_checkins.csv has 27149 rows in reality, but we are only loading/previewing the first 1000 rows
# Read with the same encoding as every other load of this file in the notebook
df1 = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows = nRowsRead)
# Name used by plotCorrelationMatrix in its plot title
df1.dataframeName = 'NY_Restauraunts_checkins.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Preview the first five rows of the check-ins sample
df1.head(5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# Plot at most 10 column distributions, 5 plots per row
plotPerColumnDistribution(df1, 10, 5)

Correlation matrix:

In [None]:
# Correlation matrix of the check-ins sample on an 8x8 inch figure
plotCorrelationMatrix(df1, 8)

Scatter and density plots:

In [None]:
# Scatter matrix of the check-ins sample: 6x6 inch figure, annotation font size 15
plotScatterMatrix(df1, 6, 15)

### Let's check 2nd file: /kaggle/input/NY_Restauraunts_tags.csv

In [None]:
# Number of rows to preview from the tags file
nRowsRead = 1000

# NY_Restauraunts_tags.csv has 3298 rows in reality, but we are only loading/previewing the first 1000 rows
df2 = pd.read_csv(
    '/kaggle/input/NY_Restauraunts_tags.csv',
    sep=',',
    encoding='ISO-8859-1',
    nrows=nRowsRead,
)
# Name shown in the correlation-matrix title
df2.dataframeName = 'NY_Restauraunts_tags.csv'

# Report the size of the loaded sample
nRow, nCol = len(df2), len(df2.columns)
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Preview the first five rows of the tags sample
df2.head(5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# Plot at most 10 column distributions, 5 plots per row
plotPerColumnDistribution(df2, 10, 5)

In [None]:
# Correlation matrix of the tags sample on an 8x8 inch figure
plotCorrelationMatrix(df2, 8)

In [None]:
# Scatter matrix of the tags sample: 6x6 inch figure, annotation font size 15
plotScatterMatrix(df2, 6, 15)

### Let's check 3rd file: /kaggle/input/NY_Restauraunts_tips.csv

In [None]:
# Number of rows to preview from the tips file (None reads the whole file)
nRowsRead = 1000
# NY_Restauraunts_tips.csv has 10377 rows in reality, but we are only loading/previewing the first 1000 rows
df3 = pd.read_csv(
    '/kaggle/input/NY_Restauraunts_tips.csv',
    sep=',',
    encoding='ISO-8859-1',
    nrows=nRowsRead,
)
# Name shown in the correlation-matrix title
df3.dataframeName = 'NY_Restauraunts_tips.csv'
# Report the size of the loaded sample
nRow, nCol = len(df3), len(df3.columns)
print(f'There are {nRow} rows and {nCol} columns')

Let's take a quick look at what the data looks like:

In [None]:
# Preview the first five rows of the tips sample
df3.head(5)

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# Plot at most 10 column distributions, 5 plots per row
plotPerColumnDistribution(df3, 10, 5)

Correlation matrix:

In [None]:
# Correlation matrix of the tips sample on an 8x8 inch figure
plotCorrelationMatrix(df3, 8)

Scatter and density plots:

In [None]:
# Scatter matrix of the tips sample: 6x6 inch figure, annotation font size 15
plotScatterMatrix(df3, 6, 15)

## Conclusion
This concludes your starter analysis! To go forward from here, click the blue "Edit Notebook" button at the top of the kernel. This will create a copy of the code and environment for you to edit. Delete, modify, and add code as you please. Happy Kaggling!

In [None]:
# Show the column names of the check-ins sample
print(df1.columns)


In [None]:
# Show the column names of the tags sample
print(df2.columns)


In [None]:
# Show the column names of the tips sample
print(df3.columns)


In [None]:
# Add a 'score' column to the check-ins sample.
# NOTE(review): the score is currently a plain copy of 'venue_ID', i.e. an
# identifier rather than a measurement - presumably a per-venue aggregate was
# intended (see the commented example below); confirm and replace.
df1['score'] = df1['venue_ID']

# Alternatively, you can use other aggregation functions like mean or sum, depending on your specific goal
# For example, calculating the mean check-ins per restaurant:
# df1['score'] = df1.groupby('restaurant_id')['actual_column_name'].transform('mean')

# Display the updated DataFrame
print(df1[['user_ID', 'venue_ID', 'score']].head())

# Plot the distribution of the calculated scores (up to 10 plots, 1 per row).
# The helper only plots columns with more than 1 and fewer than 50 unique values.
plotPerColumnDistribution(df1[['score']], 10, 1)


In [None]:
# Scatter the two link-prediction columns of df_common_neighbors against each other.
# NOTE(review): df_common_neighbors is not defined in the visible cells and the
# column names look like placeholders - confirm before running.
plt.figure(figsize=(10, 6))
x_values = df_common_neighbors['common_neighbors_column']
y_values = df_common_neighbors['preferential_attachment_column']
plt.scatter(x_values, y_values, alpha=0.5)
plt.title('Common Neighbors vs Preferential Attachment')
plt.xlabel('Common Neighbors')
plt.ylabel('Preferential Attachment')
plt.grid(True)
plt.show()


In [None]:
# Print column names of df_common_neighbors
# NOTE(review): df_common_neighbors is not created in any visible cell - confirm where it comes from
print(df_common_neighbors.columns)


In [None]:
# Preferential attachment score of an edge = product of the degrees of its endpoints.
# G is the graph built in one of the graph-construction cells.
node_degree = dict(G.degree())
preferential_attachment = [
    (source, target, node_degree[source] * node_degree[target])
    for source, target in G.edges()
]
df_preferential_attachment = pd.DataFrame(
    preferential_attachment,
    columns=['venue1', 'venue2', 'preferential_attachment'],
)


In [None]:
# Scatter common-neighbour counts against preferential-attachment scores.
# NOTE(review): the two series come from different frames; presumably they are
# row-aligned per edge - confirm they have the same length and order.
plt.figure(figsize=(10, 6))
common_neighbor_values = df_common_neighbors['common_neighbors']
attachment_values = df_preferential_attachment['preferential_attachment']
plt.scatter(common_neighbor_values, attachment_values, alpha=0.5)
plt.title('Common Neighbors vs Preferential Attachment')
plt.xlabel('Common Neighbors')
plt.ylabel('Preferential Attachment')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Assuming G, df_preferential_attachment, clustering_coefficient, degree_centrality,
# communities, and pagerank are calculated in earlier cells


def _community_node_colors(graph, communities, palette, fallback):
    """Return one color per node of ``graph``, in ``graph.nodes()`` order.

    ``communities`` is an iterable of node collections. Each node gets the
    palette entry of its community (cycling when there are more communities
    than colors); nodes that belong to no community get ``fallback``.
    """
    community_of = {
        node: index
        for index, community in enumerate(communities)
        for node in community
    }
    return [
        palette[community_of[node] % len(palette)] if node in community_of else fallback
        for node in graph.nodes()
    ]


# Plot the preferential attachment score of every edge
# NOTE(review): the title and x-label mention common neighbors, but the x-axis
# is the 'venue1' column - confirm which quantity was meant.
plt.figure(figsize=(10, 6))
plt.scatter(df_preferential_attachment['venue1'], df_preferential_attachment['preferential_attachment'], alpha=0.5)
plt.title('Common Neighbors vs Preferential Attachment')
plt.xlabel('Common Neighbors')
plt.ylabel('Preferential Attachment')
plt.grid(True)
plt.show()

# Plot Clustering Coefficient (dict views are materialised before plotting)
plt.figure(figsize=(10, 6))
plt.bar(list(clustering_coefficient.keys()), list(clustering_coefficient.values()))
plt.title('Clustering Coefficient for Venues')
plt.xlabel('Venue ID')
plt.ylabel('Clustering Coefficient')
plt.xticks(rotation=90)
plt.show()

# Plot Degree Centrality
plt.figure(figsize=(10, 6))
plt.bar(list(degree_centrality.keys()), list(degree_centrality.values()))
plt.title('Degree Centrality for Venues')
plt.xlabel('Venue ID')
plt.ylabel('Degree Centrality')
plt.xticks(rotation=90)
plt.show()

# Plot Community Detection with a small named palette.
# Colors are looked up per node so they follow the order in which the graph
# is drawn, and the palette cycles instead of running out of entries.
colors = ['red', 'green', 'blue', 'yellow', 'purple']  # Add more colors if needed
node_color = _community_node_colors(G, communities, colors, 'lightgray')

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, node_color=node_color, with_labels=True)
plt.title('Community Detection')
plt.show()

# Plot PageRank scores
plt.figure(figsize=(10, 6))
plt.bar(list(pagerank.keys()), list(pagerank.values()))
plt.title('PageRank Scores for Venues')
plt.xlabel('Venue ID')
plt.ylabel('PageRank')
plt.xticks(rotation=90)
plt.show()

# Plot Community Detection with colors spread evenly over the viridis colormap,
# so that neighbouring community indices stay visually distinct
num_communities = len(communities)
colors = [plt.cm.viridis(i / max(num_communities - 1, 1)) for i in range(num_communities)]
node_color = _community_node_colors(G, communities, colors, (0.8, 0.8, 0.8, 1.0))

plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G)
nx.draw(G, pos, node_color=node_color, with_labels=True)
plt.title('Community Detection')
plt.show()

# Link Prediction visualization may vary based on the specific algorithm used
# Additional visualizations can be added based on your requirements


In [None]:
# Install the plotting and graph libraries used by the cells in this notebook.
# NOTE(review): this is a notebook shell/magic line, not Python; `%pip install ...` is the explicit form.
pip install matplotlib networkx python-louvain


In [None]:
# Assuming checkins_data is a DataFrame with columns: 'user_id', 'venue_id', 'timestamp'
# Parse the timestamps once, then derive each calendar feature from them.
checkins_data['timestamp'] = pd.to_datetime(checkins_data['timestamp'])

# Extract temporal features: (new column, datetime attribute)
for feature_name, datetime_attribute in (
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
):
    checkins_data[feature_name] = getattr(checkins_data['timestamp'].dt, datetime_attribute)


In [None]:
import matplotlib.pyplot as plt
# Render text with Matplotlib's own engine instead of an external LaTeX installation
plt.rcParams['text.usetex'] = False


In [None]:
import matplotlib.pyplot as plt

text_to_display = "waiters!!!,socialite, exclusive, $$$$, jetsetter,sushi,zagat rated"

# Matplotlib parses text between unescaped dollar signs as mathtext, so the
# "$$$$" tag must be escaped to be drawn literally.
escaped_text = text_to_display.replace('$', r'\$')

fig, ax = plt.subplots()
ax.annotate(escaped_text, xy=(0.5, 0.5), xycoords='axes fraction',
            fontsize=12, ha='center', va='center', fontfamily='monospace')

# Save the figure or display it as needed
plt.show()


In [None]:
import pandas as pd
from wordcloud import WordCloud
import networkx as nx
import matplotlib.pyplot as plt
plt.rcParams['text.usetex'] = False
# Load data (first nRowsRead rows of each file; nRowsRead is set in an earlier cell)
df_checkins = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
df_tips = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
df_tags = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)


def _escape_mathtext(label):
    """Return ``label`` as a string with dollar signs escaped.

    Matplotlib parses text between unescaped dollar signs as mathtext, which
    breaks labels such as "$$$$"; escaping makes them render literally.
    """
    return str(label).replace('$', r'\$')


# Create a graph using NetworkX
G = nx.Graph()

# Add edges for check-ins
G.add_edges_from(zip(df_checkins['user_ID'], df_checkins['venue_ID']))

# Add edges for tips
G.add_edges_from(zip(df_tips['user_ID'], df_tips['venue_ID']))

# Add edges for tags (tag strings become nodes of the same graph)
G.add_edges_from(zip(df_tags['venue_ID'], df_tags['tags']))

# Visualize the graph, with labels made safe for Matplotlib's text parser
pos = nx.spring_layout(G)
node_labels = {node: _escape_mathtext(node) for node in G.nodes()}
nx.draw(G, pos, labels=node_labels, with_labels=True, font_weight='bold')
plt.title("User-Venue Interaction Network")
plt.show()

# Group by venue and concatenate tags
venue_tags = df_tags.groupby('venue_ID')['tags'].apply(lambda x: ' '.join(str(i) for i in x)).reset_index()

# Merge with check-ins and tips data to include user-venue interactions
# (each venue's tags are repeated once per matching check-in/tip row)
venue_tags = pd.merge(venue_tags, df_checkins, on='venue_ID', how='left')
venue_tags = pd.merge(venue_tags, df_tips, on='venue_ID', how='left')

# Plot tag distribution
# NOTE(review): tags are tokenised on whitespace; if a tag value can contain
# spaces or comma-separated lists, the tokens will not be whole tags - confirm.
tag_counts = venue_tags['tags'].str.split().explode().value_counts()
# Escape dollar signs so tick labels are not parsed as mathtext
tag_counts.index = tag_counts.index.map(_escape_mathtext)
tag_counts.plot(kind='bar', figsize=(10, 6), color='skyblue')
plt.title("Tag Distribution for Venues")
plt.xlabel("Tag")
plt.ylabel("Count")
plt.show()

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Read the first nRowsRead rows of each source table
_read_options = {'delimiter': ',', 'encoding': 'ISO-8859-1', 'nrows': nRowsRead}
checkins_df = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', **_read_options)
tips_df = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', **_read_options)
tags_df = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', **_read_options)

# Directed user -> venue graph
G = nx.DiGraph()

# Check-ins: register both node sets with their bipartite side, then the edges
for column_name, side in (('user_ID', 0), ('venue_ID', 1)):
    G.add_nodes_from(checkins_df[column_name], bipartite=side)
G.add_edges_from(zip(checkins_df['user_ID'], checkins_df['venue_ID']))

# Tips contribute further user -> venue edges
G.add_edges_from(zip(tips_df['user_ID'], tips_df['venue_ID']))

# Draw the network with a force-directed layout
pos = nx.spring_layout(G)
plt.figure(figsize=(10, 8))
nx.draw(G, pos, node_size=10)
plt.title('Restaurant Check-ins and Tips Network')
plt.show()

# Basic size and density statistics
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Density of the graph:", nx.density(G))

# Perform link prediction algorithms (e.g., Common Neighbors, Jaccard Coefficient, Adamic-Adar Index)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Read the first nRowsRead check-in rows
checkins_df = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)

# Undirected user-venue graph; the 'bipartite' attribute marks the side (0 = user, 1 = venue)
# NOTE(review): if a user_ID and a venue_ID can share the same value they
# collapse into one node whose side is overwritten - confirm the ID spaces are disjoint.
G = nx.Graph()
G.add_nodes_from(checkins_df['user_ID'], bipartite=0)
G.add_nodes_from(checkins_df['venue_ID'], bipartite=1)
G.add_edges_from(zip(checkins_df['user_ID'], checkins_df['venue_ID']))


def _degrees_on_side(graph, side):
    """Return the degrees of the nodes of ``graph`` whose 'bipartite' attribute equals ``side``."""
    return [degree for node, degree in graph.degree() if graph.nodes[node]['bipartite'] == side]


def _show_degree_histogram(degrees, color, label, title):
    """Draw and show a 50-bin histogram of ``degrees`` with the given styling."""
    plt.hist(degrees, bins=50, alpha=0.5, color=color, label=label)
    plt.title(title)
    plt.xlabel('Degree')
    plt.ylabel('Frequency')
    plt.legend()
    plt.show()


# Degree distribution of user nodes
user_degrees = _degrees_on_side(G, 0)
_show_degree_histogram(user_degrees, 'b', 'Users', 'Degree Distribution of User Nodes')

# Degree distribution of venue nodes
venue_degrees = _degrees_on_side(G, 1)
_show_degree_histogram(venue_degrees, 'r', 'Venues', 'Degree Distribution of Venue Nodes')

# Largest connected component of the undirected graph
largest_wcc = max(nx.connected_components(G), key=len)
G_largest_wcc = G.subgraph(largest_wcc)

# Minimum degree a node needs to be kept (replace with your desired threshold)
degree_threshold = 2

# Keep the nodes whose degree inside the largest component reaches the threshold
filtered_nodes = [node for node, degree in G_largest_wcc.degree() if degree >= degree_threshold]
G_filtered = G_largest_wcc.subgraph(filtered_nodes)

# Degree distribution of the remaining user nodes
user_degrees_filtered = _degrees_on_side(G_filtered, 0)
_show_degree_histogram(user_degrees_filtered, 'b', 'Users', 'Filtered Degree Distribution of User Nodes')

# Degree distribution of the remaining venue nodes
venue_degrees_filtered = _degrees_on_side(G_filtered, 1)
_show_degree_histogram(venue_degrees_filtered, 'r', 'Venues', 'Filtered Degree Distribution of Venue Nodes')


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Load your dataset
# Only the first nRowsRead rows of each file are read (nRowsRead is set in an earlier cell)
checkins_data = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tips_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tags_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)

# Merge check-ins with tips on (user, venue), then attach tags by venue
merged_data = pd.merge(checkins_data, tips_data, on=['user_ID', 'venue_ID'], how='outer')
merged_data = pd.merge(merged_data, tags_data, on='venue_ID', how='outer')

# Assuming you have a target variable (e.g., whether a user liked the venue or not)
# You can create a binary target variable based on your specific problem

# Binary target: 1 when the row has a tip, 0 otherwise
# NOTE(review): the TF-IDF features below are built from the same 'tips' text
# that defines this target, which presumably leaks the label - confirm.
merged_data['liked'] = merged_data['tips'].notnull().astype(int)

# Drop identifier, text and target columns from the feature frame
X = merged_data.drop(['user_ID', 'venue_ID', 'tips', 'tags', 'liked'], axis=1)
y = merged_data['liked']

# Convert text data to numerical representation using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(merged_data['tips'].fillna(''))

# get_feature_names() was removed from recent scikit-learn releases in favour
# of get_feature_names_out(); fall back to the old name on older installs.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    tfidf_columns = tfidf_vectorizer.get_feature_names_out()
else:
    tfidf_columns = tfidf_vectorizer.get_feature_names()

# Combine TF-IDF features with other features; sharing X's index keeps the
# column-wise concat row-aligned
X_combined = pd.concat([X, pd.DataFrame(X_tfidf.toarray(), columns=tfidf_columns, index=X.index)], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Initialize the kNN classifier
knn = KNeighborsClassifier(n_neighbors=3)  # You can adjust the number of neighbors

# Train the model
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

# Print the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_curve, auc, precision_recall_curve, precision_score, recall_score

# Read the three source tables in full
_csv_options = {'delimiter': ',', 'encoding': 'ISO-8859-1'}
checkins = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', **_csv_options)
tips = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', **_csv_options)
tags = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', **_csv_options)

# Outer-join check-ins with tips on (user, venue), then with tags on venue
merged_data = (
    checkins
    .merge(tips, on=['user_ID', 'venue_ID'], how='outer')
    .merge(tags, on='venue_ID', how='outer')
)

# One feature row per venue: the number of merged rows it appears in
rows_per_venue = merged_data.groupby('venue_ID').size()
features = rows_per_venue.reset_index(name='checkin_count')

# 80/20 train/test split with a fixed seed
train, test = train_test_split(features, test_size=0.2, random_state=42)

# Fit the neighbour index on the training counts (5 neighbours per query)
knn = NearestNeighbors(n_neighbors=5).fit(train[['checkin_count']])

# Distances and indices of the nearest training venues for each test venue
distances, indices = knn.kneighbors(test[['checkin_count']])

# Cut-off used both for the label (checkin_count >= threshold) and for the
# neighbour distance (distance < threshold); adjust according to your problem
threshold = 55

# 1 where a neighbour is closer than the cut-off, 0 otherwise
predictions = (distances < threshold).astype(int)

# Precision-recall curve of "any close neighbour" against the count-based label
actual_positive = test['checkin_count'] >= threshold
has_close_neighbor = predictions.sum(axis=1) > 0
precision, recall, thresholds = precision_recall_curve(actual_positive, has_close_neighbor)

# Minimum precision to aim for
target_precision = 0.3  # Adjust as needed

# First curve position reaching the target precision, or the last position if none does
best_threshold_index = len(precision) - 1
for position, value in enumerate(precision):
    if value >= target_precision:
        best_threshold_index = position
        break

# `precision` is one entry longer than `thresholds`, so fall back to the last threshold
if best_threshold_index < len(thresholds):
    best_threshold = thresholds[best_threshold_index]
else:
    best_threshold = thresholds[-1]

# Re-evaluate with the selected threshold applied to the distances
updated_predictions = (distances < best_threshold).astype(int)
updated_has_close_neighbor = updated_predictions.sum(axis=1) > 0
updated_precision = precision_score(actual_positive, updated_has_close_neighbor)
updated_recall = recall_score(actual_positive, updated_has_close_neighbor)

print(f'Updated Precision: {updated_precision:.4f}')
print(f'Updated Recall: {updated_recall:.4f}')
print(f'Used Threshold: {best_threshold:.4f}')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Load your dataset
# Reads the first nRowsRead rows of NY_Restauraunts_checkins.csv, NY_Restauraunts_tips.csv, NY_Restauraunts_tags.csv
# NOTE(review): these three frames are not used below; the cell works on the
# `merged_data` left behind by a previously executed cell - confirm this is intended.
checkins_data = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tips_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tags_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)

# No merge happens in this cell; `merged_data` must already exist
# Convert 'user_ID' and 'venue_ID' columns to strings
merged_data['user_ID'] = merged_data['user_ID'].astype(str)
merged_data['venue_ID'] = merged_data['venue_ID'].astype(str)

# Target label: 'user_ID' and 'venue_ID' concatenated with an underscore
# NOTE(review): this makes every distinct (user, venue) pair its own class - confirm.
y = merged_data['user_ID'] + '_' + merged_data['venue_ID']

# Fill NaN values with appropriate placeholders or drop them based on your requirement
# ...

# Convert 'venue_ID' column to strings (already converted above)
merged_data['venue_ID'] = merged_data['venue_ID'].astype(str)

# Handle NaN values in 'tips' column
merged_data['tips'] = merged_data['tips'].fillna('')  # Replace NaN with an empty string

# Feature extraction
vectorizer = TfidfVectorizer(stop_words='english', min_df=2)  # Ignore common English stop words and consider words that appear in at least 2 documents
mlb = MultiLabelBinarizer()

# TF-IDF features of the tip text
X_text = vectorizer.fit_transform(merged_data['tips'])
# One indicator column per comma-separated piece of the tip text
# NOTE(review): named X_tags but built from 'tips', not 'tags' - confirm which column was meant.
X_tags = mlb.fit_transform(merged_data['tips'].apply(lambda x: x.split(',')))

# Concatenate text features and tag features
X = pd.concat([pd.DataFrame(X_text.toarray()), pd.DataFrame(X_tags)], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Calculate micro-averaged precision and recall over all classes
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')

# Print the results
print(f'Precision: {precision}')
print(f'Recall: {recall}')


In [None]:
# Inspect the 'tips' column of the merged frame
tips_column = merged_data['tips']

# All distinct tip values
print("Unique values in 'tips':", tips_column.unique())

# The first ten entries, one per line
print("Sample 'tips' entries:")
for sample_tip in tips_column[:10]:
    print(sample_tip)


In [None]:
# Import necessary libraries for KNN and its evaluation.
# The metric functions are imported here so the cell does not depend on a
# later cell having been run first.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)  # You can adjust the number of neighbors (n_neighbors) based on your preference

# Train the KNN classifier on the split produced by an earlier cell
knn.fit(X_train, y_train)

# Make predictions
y_pred_knn = knn.predict(X_test)

# Evaluate the KNN model (micro-averaged over all classes)
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn, average='micro')
recall_knn = recall_score(y_test, y_pred_knn, average='micro')

# Print the results for KNN
print('Results for KNN:')
print(f'Accuracy: {accuracy_knn}')
print(f'Precision: {precision_knn}')
print(f'Recall: {recall_knn}')


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Load your dataset
# Reads the first nRowsRead rows of NY_Restauraunts_checkins.csv, NY_Restauraunts_tips.csv, NY_Restauraunts_tags.csv
checkins_data = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tips_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)
tags_data = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', delimiter=',', encoding='ISO-8859-1', nrows=nRowsRead)

# Display column names before merging
print("Columns in checkins_data:", checkins_data.columns)
print("Columns in tips_data:", tips_data.columns)

# Merge dataframes based on common columns: keep only (user, venue) pairs
# present in both check-ins and tips
merged_data = pd.merge(checkins_data, tips_data, on=['user_ID', 'venue_ID'], how='inner')

# Display column names after merging
print("Columns in merged_data:", merged_data.columns)

# Check the column names in merged_data
print(merged_data.columns)

# The feature extraction below reads the 'tips' column, so fail early with a
# message naming that column if it is missing
if 'tips' not in merged_data.columns:
    raise KeyError("'tips' column not found in merged_data")

# Convert 'user_ID' and 'venue_ID' columns to strings
merged_data['user_ID'] = merged_data['user_ID'].astype(str)
merged_data['venue_ID'] = merged_data['venue_ID'].astype(str)

# Target label: 'user_ID' and 'venue_ID' concatenated with an underscore
# NOTE(review): every distinct (user, venue) pair becomes its own class - confirm.
y = merged_data['user_ID'] + '_' + merged_data['venue_ID']

# Handle NaN values in 'tips' column
merged_data['tips'] = merged_data['tips'].fillna('')  # Replace NaN with an empty string

# Feature extraction
vectorizer = TfidfVectorizer()
mlb = MultiLabelBinarizer()

# TF-IDF over the venue ID strings
# NOTE(review): this encodes the venue identifier itself, which is also part of the label - confirm.
X_text = vectorizer.fit_transform(merged_data['venue_ID'])
# One indicator column per comma-separated piece of the tip text
X_tags = mlb.fit_transform(merged_data['tips'].apply(lambda x: x.split(',')))

# Concatenate text features and tag features
X = pd.concat([pd.DataFrame(X_text.toarray()), pd.DataFrame(X_tags)], axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Support Vector Machine (SVM) classifier
clf = SVC(kernel='linear', C=1.0, random_state=42)
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate the model (micro-averaged over all classes)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')

# Print the results
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')


In [None]:
# ...

# Fit a linear SVM on the split produced by the previous cell
clf = SVC(C=1.0, kernel='linear', random_state=42)
clf.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = clf.predict(X_test)

# Accuracy, then precision/recall under each averaging scheme
accuracy = accuracy_score(y_test, y_pred)
_scores = {}
for _average in ('micro', 'macro', 'weighted'):
    _scores['precision', _average] = precision_score(y_test, y_pred, average=_average)
    _scores['recall', _average] = recall_score(y_test, y_pred, average=_average)

precision_micro = _scores['precision', 'micro']
recall_micro = _scores['recall', 'micro']
precision_macro = _scores['precision', 'macro']
recall_macro = _scores['recall', 'macro']
precision_weighted = _scores['precision', 'weighted']
recall_weighted = _scores['recall', 'weighted']

# Report every metric
print(f'Accuracy: {accuracy}')
print(f'Precision (Micro): {precision_micro}')
print(f'Recall (Micro): {recall_micro}')
print(f'Precision (Macro): {precision_macro}')
print(f'Recall (Macro): {recall_macro}')
print(f'Precision (Weighted): {precision_weighted}')
print(f'Recall (Weighted): {recall_weighted}')


In [None]:
import numpy as np

def top_k_accuracy_score(y_true, y_score, k=5):
    """Return the fraction of samples whose true class is among the k best scores.

    Args:
        y_true: Sequence of integer class indices, one per sample. Lists,
            NumPy arrays and pandas Series are all accepted.
        y_score: 2-D array-like of shape (n_samples, n_classes) holding one
            score per class; column ``j`` is the score of class index ``j``.
        k: Number of top-scoring classes to consider per sample (>= 1).

    Returns:
        The mean hit rate as a NumPy float.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``y_score`` is not 2-D.
    """
    if k < 1:
        # A slice of ``-0:`` would silently select *every* column.
        raise ValueError(f"k must be >= 1, got {k}")

    scores = np.asarray(y_score)
    if scores.ndim != 2:
        raise ValueError(
            "y_score must be 2-D (n_samples, n_classes); "
            f"got an array with {scores.ndim} dimension(s)"
        )

    # np.asarray lets plain lists and pandas Series (which have no .reshape)
    # be compared column-wise against the top-k class indices.
    true_column = np.asarray(y_true).reshape(-1, 1)

    # argsort is ascending, so the k highest-scoring classes are the last k columns.
    top_k_predictions = np.argsort(scores, axis=1)[:, -k:]
    true_in_top_k = np.any(top_k_predictions == true_column, axis=1)
    return np.mean(true_in_top_k)

# Usage
# NOTE(review): y_pred is produced by clf.predict(...) in the cells above, i.e. it
# holds predicted labels rather than a 2-D per-class score matrix, while
# top_k_accuracy_score sorts its second argument along axis=1. This call probably
# needs per-class scores (e.g. clf.decision_function(X_test)) — confirm.
k = 5
top_k_accuracy = top_k_accuracy_score(y_test, y_pred, k=k)
print(f'Top-{k} Accuracy: {top_k_accuracy}')


In [None]:
from sklearn.metrics import precision_score, recall_score

# Assuming you have trained your classifier and loaded your test data (X_test)
# ...

# Names of the link-prediction similarity metrics to report on
similarity_metrics = [
    'Random Link',
    'Distance',
    'Common Neighbor (User)',
    'Common Neighbor (Venue)',
    'Preferential Attachment',
    'Adamic-Adar (User)',
    'Adamic-Adar (Venue)',
    'Katz',
]

# Results containers
precision_values = []
recall_values = []

# y_pred is a positional array, whereas y_test keeps the row labels of the
# original frame. Re-index the predictions with y_test's index so both sides are
# aligned; indexing y_pred directly with those labels selects the wrong rows or
# raises IndexError.
y_pred_series = pd.Series(y_pred, index=y_test.index)

for metric in similarity_metrics:
    # True where the metric name occurs inside the ground-truth label
    y_test_metric = y_test.apply(lambda x: metric in x)

    if y_test_metric.any():
        # Apply the same membership test to the predictions so precision/recall
        # compare two boolean vectors, as average='binary' requires.
        y_pred_metric = y_pred_series.apply(lambda x: metric in x)

        precision = precision_score(y_test_metric, y_pred_metric, average='binary')
        recall = recall_score(y_test_metric, y_pred_metric, average='binary')
    else:
        # The metric never occurs in the ground truth: report zeros
        precision, recall = 0.0, 0.0

    precision_values.append(precision)
    recall_values.append(recall)

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Similarity Metric': similarity_metrics,
    'Precision': precision_values,
    'Recall': recall_values
})

# Print the results
print(results_df)


In [None]:
import numpy as np

# Report how many distinct labels occur in the ground truth ...
print("Unique values in y_test:", np.unique(y_test).size)

# ... and how many distinct labels the classifier actually predicted
print("Unique values in y_pred:", np.unique(y_pred).size)


In [None]:
# Example: one-vs-rest precision/recall for the 'Random Link' label.
# Both vectors are turned into booleans ("is this label") before scoring.
metric = 'Random Link'
precision, recall = (
    score_fn(y_test == metric, y_pred == metric, average='binary')
    for score_fn in (precision_score, recall_score)
)
print(f'Precision for {metric}: {precision}')
print(f'Recall for {metric}: {recall}')


In [None]:
# ...

# Replace the following placeholders with your actual link prediction algorithms

# Random Link
def random_link_prediction(X_test):
    """Placeholder for the Random Link predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Distance
def distance_prediction(X_test):
    """Placeholder for the Distance predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Common Neighbor (User)
def common_neighbor_user_prediction(X_test):
    """Placeholder for the Common Neighbor (User) predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Common Neighbor (Venue)
def common_neighbor_venue_prediction(X_test):
    """Placeholder for the Common Neighbor (Venue) predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Preferential Attachment
def preferential_attachment_prediction(X_test):
    """Placeholder for the Preferential Attachment predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Adamic-Adar (User)
def adamic_adar_user_prediction(X_test):
    """Placeholder for the Adamic-Adar (User) predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Adamic-Adar (Venue)
def adamic_adar_venue_prediction(X_test):
    """Placeholder for the Adamic-Adar (Venue) predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# Katz
def katz_prediction(X_test):
    """Placeholder for the Katz predictor.

    Nothing is implemented yet: ``X_test`` is ignored and the ``Ellipsis``
    sentinel is returned. Replace the return value with real predictions.
    """
    return ...

# ...

# Calculate precision and recall for each link prediction algorithm

# Predict once and reuse the result for every metric. Calling the predictor
# separately per metric repeats the work and, for a non-deterministic predictor
# such as Random Link, would score each metric on a different set of predictions.
random_predictions = random_link_prediction(X_test)

precision_micro_random = precision_score(y_test, random_predictions, average='micro')
recall_micro_random = recall_score(y_test, random_predictions, average='micro')
precision_macro_random = precision_score(y_test, random_predictions, average='macro')
recall_macro_random = recall_score(y_test, random_predictions, average='macro')
precision_weighted_random = precision_score(y_test, random_predictions, average='weighted')
recall_weighted_random = recall_score(y_test, random_predictions, average='weighted')

# Repeat the above code for each link prediction algorithm
# ...

# Print results for each link prediction algorithm

print("\nRandom Link Results:")
print(f'Precision (Micro): {precision_micro_random}')
print(f'Recall (Micro): {recall_micro_random}')
print(f'Precision (Macro): {precision_macro_random}')
print(f'Recall (Macro): {recall_macro_random}')
print(f'Precision (Weighted): {precision_weighted_random}')
print(f'Recall (Weighted): {recall_weighted_random}')

# Repeat the above print statements for each link prediction algorithm
# ...

# ...


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

# Load the three NY restaurant tables (check-ins, tips, tags), reading at most
# nRowsRead rows from each; all three share the same parsing options.
read_options = {'delimiter': ',', 'encoding': 'ISO-8859-1', 'nrows': nRowsRead}
checkins_df = pd.read_csv('/kaggle/input/NY_Restauraunts_checkins.csv', **read_options)
tips_df = pd.read_csv('/kaggle/input/NY_Restauraunts_tips.csv', **read_options)
tags_df = pd.read_csv('/kaggle/input/NY_Restauraunts_tags.csv', **read_options)

# Create a bipartite user-venue graph from the check-ins.
#
# IDs are stringified once, column-wise, and the same strings are used for both
# the nodes and the edges. DataFrame.iterrows() (previously used for the edges)
# does not preserve dtypes across a row, so its stringified IDs could differ
# from the node names built from the columns (e.g. '1.0' vs '1') and silently
# create extra, attribute-less nodes.
checkin_users = [str(user) for user in checkins_df['user_ID']]
checkin_venues = [str(venue) for venue in checkins_df['venue_ID']]

G = nx.Graph()

# Add user nodes tagged with a 'user_ID' attribute
G.add_nodes_from([(user, {'user_ID': user}) for user in checkin_users])

# Add venue nodes tagged with a 'venue_ID' attribute
# NOTE(review): users and venues share one namespace after str(); an ID that
# occurs in both columns becomes a single node carrying both attributes and is
# classified as a user below — confirm the two ID spaces are disjoint.
G.add_nodes_from([(venue, {'venue_ID': venue}) for venue in checkin_venues])

# Add one edge per check-in row
checkin_edges = list(zip(checkin_users, checkin_venues))
G.add_edges_from(checkin_edges)

# Separate users and venues for scoring
user_nodes = {n for n, d in G.nodes(data=True) if 'user_ID' in d}
venue_nodes = set(G) - user_nodes


# Draw G in two columns: users at x=0, venues at x=1, stacked by enumeration order
pos = {
    **{node: (0, row) for row, node in enumerate(user_nodes)},
    **{node: (1, row) for row, node in enumerate(venue_nodes)},
}
nx.draw(G, pos=pos, font_weight='bold')
plt.title('Bipartite Graph of Users and Venues')
plt.show()

# Histogram of node degrees over the whole graph (users and venues together)
degree_sequence = [degree for _, degree in G.degree()]
plt.hist(degree_sequence, bins=20, alpha=0.75)
plt.title("Degree Distribution")
plt.xlabel("Degree")
plt.ylabel("Count")
plt.show()

# Degree centrality of every node in G
centrality = nx.degree_centrality(G)

# Draw G (default layout) with each node's marker size scaled by its centrality
node_sizes = [500 * centrality[node] for node in G.nodes()]
nx.draw(G, with_labels=True, node_size=node_sizes, cmap=plt.cm.Blues, font_weight='bold')
plt.title('Network Centrality Visualization')
plt.show()

# Detect communities using the Louvain algorithm (python-louvain's `community`).
# No import of `community` is visible before this point in the notebook, so it
# is imported here to keep the cell runnable on its own.
import community

partition = community.best_partition(G)

# Plot the graph with communities in different colors
pos = nx.spring_layout(G)  # You can use other layouts
# plt.cm.get_cmap is deprecated and removed in recent Matplotlib releases;
# pyplot.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
# Look the community up per node, in G's own node order, instead of relying on
# the iteration order of the partition dict matching the order nodes are drawn in.
node_colors = [partition[node] for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=50, cmap=cmap, node_color=node_colors)
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.title('Community Detection Visualization')
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Build the user-venue graph B. Check-ins and tips each contribute user nodes
# (bipartite=0), venue nodes (bipartite=1) and one edge per row; the tags table
# contributes venue nodes only.
B = nx.Graph()
for interactions in (checkins_df, tips_df):
    B.add_nodes_from(interactions['user_ID'].unique(), bipartite=0, label='user')
    B.add_nodes_from(interactions['venue_ID'].unique(), bipartite=1, label='venue')
    B.add_edges_from(zip(interactions['user_ID'], interactions['venue_ID']))
B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Split the nodes by side so each side can get its own colour
user_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0]
venue_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 1]

# Force-directed layout; users in sky blue, venues in salmon
pos = nx.spring_layout(B)
for nodelist, color, legend_label in (
    (user_nodes, 'skyblue', 'Users'),
    (venue_nodes, 'salmon', 'Venues'),
):
    nx.draw_networkx_nodes(B, pos, nodelist=nodelist, node_color=color, node_size=50, label=legend_label)
nx.draw_networkx_edges(B, pos, width=1.0, alpha=0.5)
# nx.draw_networkx_labels(B, pos)
plt.title('Bipartite Graph of NYC Restaurants')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Build the user-venue graph B: nodes and edges from check-ins and tips, plus
# venue-only nodes from the tags table.
B = nx.Graph()
for source_df in (checkins_df, tips_df):
    B.add_nodes_from(source_df['user_ID'].unique(), bipartite=0, label='user')
    B.add_nodes_from(source_df['venue_ID'].unique(), bipartite=1, label='venue')
    B.add_edges_from(zip(source_df['user_ID'], source_df['venue_ID']))
B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Node lists per side, selected through the 'bipartite' node attribute
user_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0]
venue_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 1]

# Hand-made two-column layout: users at x=1, venues at x=2, y = list position
pos = {node: (1, index) for index, node in enumerate(user_nodes)}
pos.update({node: (2, index) for index, node in enumerate(venue_nodes)})

# Draw each side in its own colour, then the edges
for nodelist, color, legend_label in (
    (user_nodes, 'skyblue', 'Users'),
    (venue_nodes, 'salmon', 'Venues'),
):
    nx.draw_networkx_nodes(B, pos, nodelist=nodelist, node_color=color, node_size=50, label=legend_label)
nx.draw_networkx_edges(B, pos, width=1.0, alpha=0.5)
# nx.draw_networkx_labels(B, pos)
plt.title('Bipartite Graph of NYC Restaurants')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import community  # Louvain community detection

# Assuming you have loaded the datasets and created the bipartite graph 'B'
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Create a bipartite graph
# B = nx.Graph()

# # Add nodes and edges for check-ins
# B.add_nodes_from(checkins_df['user_ID'].unique(), bipartite=0, label='user')
# B.add_nodes_from(checkins_df['venue_ID'].unique(), bipartite=1, label='venue')
# B.add_edges_from(zip(checkins_df['user_ID'], checkins_df['venue_ID']))

# # Add nodes and edges for tips
# B.add_nodes_from(tips_df['user_ID'].unique(), bipartite=0, label='user')
# B.add_nodes_from(tips_df['venue_ID'].unique(), bipartite=1, label='venue')
# B.add_edges_from(zip(tips_df['user_ID'], tips_df['venue_ID']))

# # Add nodes for tags
# B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Histogram of node degrees in B
degree_sequence = [degree for _, degree in B.degree()]
plt.hist(degree_sequence, bins=20, alpha=0.7)
plt.title('Degree Distribution')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.show()

# Degree and closeness centrality for every node of B
degree_centrality = nx.degree_centrality(B)
closeness_centrality = nx.closeness_centrality(B)

# Draw B with marker sizes scaled by degree centrality
node_sizes = [3000 * degree_centrality[node] for node in B]
pos = nx.spring_layout(B)
nx.draw(B, pos, font_weight='bold', node_color='skyblue', node_size=node_sizes)
plt.title('Graph with Node Sizes Proportional to Degree Centrality')
plt.show()

# Detect communities using Louvain algorithm
partition = community.best_partition(B)

# Plot the graph with communities in different colors
pos = nx.spring_layout(B)
cmap = plt.get_cmap('viridis')
# A colormap called with an *integer* uses it as a raw index into its lookup
# table, so community ids 0, 1, 2, ... would all land on almost the same colour
# at the dark end of the map. Rescale the ids to floats in [0, 1] so that the
# communities are spread over the whole colormap.
id_span = max(max(partition.values(), default=0), 1)
colors = [cmap(partition[node] / id_span) for node in B.nodes]
nx.draw(B, pos, node_color=colors, cmap=cmap)
plt.title('Graph with Communities Detected by Louvain Algorithm')
plt.show()


In [None]:
# Degree and closeness centrality of every node in B (dicts keyed by node)
degree_centrality, closeness_centrality = (
    nx.degree_centrality(B),
    nx.closeness_centrality(B),
)

# print(degree_centrality,closeness_centrality )

In [None]:
# Notebook display: number of entries currently held in venue_nodes
len(venue_nodes)

In [None]:
# Notebook display: number of entries currently held in user_nodes
len(user_nodes)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Degree of every node in B
degree_sequence = [degree for _, degree in B.degree()]

# Map each distinct degree to the number of nodes having it
unique_degrees, occurrences = np.unique(degree_sequence, return_counts=True)
degree_counts = dict(zip(unique_degrees, occurrences))

# Degrees in ascending order, with their frequencies in the same order
sorted_degrees = sorted(degree_counts)
frequencies = [degree_counts[degree] for degree in sorted_degrees]

# Line plot of the distribution on log-log axes
plt.plot(sorted_degrees, frequencies, linestyle='-', markersize=8)

plt.xscale('log')
plt.yscale('log')

plt.title('Degree Distribution (Log-Log Scale)')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Degrees of the user-side and venue-side nodes of B
user_degree_sequence = [B.degree(node) for node in user_nodes]
venue_degree_sequence = [B.degree(node) for node in venue_nodes]

# degree -> number of nodes with that degree, per side
user_values, user_occurrences = np.unique(user_degree_sequence, return_counts=True)
user_degree_counts = dict(zip(user_values, user_occurrences))

venue_values, venue_occurrences = np.unique(venue_degree_sequence, return_counts=True)
venue_degree_counts = dict(zip(venue_values, venue_occurrences))

# Unzip the (degree, count) pairs, sorted by degree, into parallel tuples
user_pairs = sorted(user_degree_counts.items())
sorted_user_degrees, sorted_user_counts = zip(*user_pairs)
venue_pairs = sorted(venue_degree_counts.items())
sorted_venue_degrees, sorted_venue_counts = zip(*venue_pairs)

# One marked line per side, both on log-log axes
plt.plot(sorted_user_degrees, sorted_user_counts, marker='o', linestyle='-', markersize=8, label='Users')
plt.plot(sorted_venue_degrees, sorted_venue_counts, marker='o', linestyle='-', markersize=8, label='Venues')

plt.xscale('log')
plt.yscale('log')

plt.title('Degree Distribution of Users and Venues (Log-Log Scale)')
plt.xlabel('Degree')
plt.ylabel('Frequency')
plt.legend()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Build the user-venue graph B: nodes and edges from check-ins and tips, plus
# venue-only nodes from the tags table.
B = nx.Graph()
for interactions in (checkins_df, tips_df):
    B.add_nodes_from(interactions['user_ID'].unique(), bipartite=0, label='user')
    B.add_nodes_from(interactions['venue_ID'].unique(), bipartite=1, label='venue')
    B.add_edges_from(zip(interactions['user_ID'], interactions['venue_ID']))
B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Node lists per side, selected through the 'bipartite' node attribute
user_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0]
venue_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 1]

# Hand-made two-column layout: users at x=1, venues at x=2, y = list position
pos = {node: (1, index) for index, node in enumerate(user_nodes)}
pos.update({node: (2, index) for index, node in enumerate(venue_nodes)})

# Draw each side in its own colour, then the edges
for nodelist, color, legend_label in (
    (user_nodes, 'skyblue', 'Users'),
    (venue_nodes, 'salmon', 'Venues'),
):
    nx.draw_networkx_nodes(B, pos, nodelist=nodelist, node_color=color, node_size=50, label=legend_label)
nx.draw_networkx_edges(B, pos, width=1.0, alpha=0.5)
# nx.draw_networkx_labels(B, pos)
plt.title('Bipartite Graph of NYC Restaurants')
plt.legend()
plt.show()

# Degree of every node, per side
user_degree_sequence = [B.degree(node) for node in user_nodes]
venue_degree_sequence = [B.degree(node) for node in venue_nodes]

# (degree, count) pairs sorted by degree, unzipped into parallel tuples
user_histogram = sorted(Counter(user_degree_sequence).items())
sorted_user_degrees, sorted_user_counts = zip(*user_histogram)
venue_histogram = sorted(Counter(venue_degree_sequence).items())
sorted_venue_degrees, sorted_venue_counts = zip(*venue_histogram)

# Both distributions as lines on log-log axes
plt.plot(sorted_user_degrees, sorted_user_counts, label='Users')
plt.plot(sorted_venue_degrees, sorted_venue_counts, label='Venues')
plt.xscale('log')
plt.yscale('log')
plt.title('Degree Distribution of Users and Venues')
plt.xlabel('Degree')
plt.ylabel('Frequency')
# plt.xlim(1,30)
plt.legend()
plt.show()


In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Build the user-venue graph B: nodes and edges from check-ins and tips, plus
# venue-only nodes from the tags table.
B = nx.Graph()
for source_df in (checkins_df, tips_df):
    B.add_nodes_from(source_df['user_ID'].unique(), bipartite=0, label='user')
    B.add_nodes_from(source_df['venue_ID'].unique(), bipartite=1, label='venue')
    B.add_edges_from(zip(source_df['user_ID'], source_df['venue_ID']))
B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Node sets per side, selected through the 'bipartite' node attribute
user_nodes = {n for n, d in B.nodes(data=True) if d['bipartite'] == 0}
venue_nodes = {n for n, d in B.nodes(data=True) if d['bipartite'] == 1}

# node -> degree, restricted to each side
user_degrees = dict(B.degree(user_nodes))
venue_degrees = dict(B.degree(venue_nodes))

# Side-by-side histograms: users on the left, venues (salmon) on the right
plt.figure(figsize=(12, 6))
panels = (
    (1, user_degrees, 'User Degree Distribution', {}),
    (2, venue_degrees, 'Venue Degree Distribution', {'color': 'salmon'}),
)
for position, degrees, panel_title, extra_style in panels:
    plt.subplot(1, 2, position)
    plt.hist(list(degrees.values()), bins=20, alpha=0.7, **extra_style)
    plt.title(panel_title)
    plt.xlabel('Degree')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Notebook display: the degree values stored in user_degrees
user_degrees.values()

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Create a bipartite graph
B = nx.Graph()

# Add nodes and edges for check-ins
B.add_nodes_from(checkins_df['user_ID'].unique(), bipartite=0, label='user')
B.add_nodes_from(checkins_df['venue_ID'].unique(), bipartite=1, label='venue')
B.add_edges_from(zip(checkins_df['user_ID'], checkins_df['venue_ID']))

# Add nodes and edges for tips
B.add_nodes_from(tips_df['user_ID'].unique(), bipartite=0, label='user')
B.add_nodes_from(tips_df['venue_ID'].unique(), bipartite=1, label='venue')
B.add_edges_from(zip(tips_df['user_ID'], tips_df['venue_ID']))

# Add nodes for tags
B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Set positions for the bipartite graph.
# bipartite_layout's second argument is the set of nodes of ONE side (drawn in
# the first column); every other node goes to the opposite column. Passing all
# of B.nodes put every node on the same side, so only the user side
# (bipartite == 0) is passed here.
left_nodes = [node for node, attrs in B.nodes(data=True) if attrs['bipartite'] == 0]
pos = nx.bipartite_layout(B, left_nodes)

# Draw the bipartite graph with different colors for users and venues
plt.figure(figsize=(1, 6))
nx.draw(B, pos, node_color=['skyblue' if B.nodes[node]['bipartite'] == 0 else 'salmon' for node in B.nodes], node_size=5)
plt.title('Bipartite Graph of NYC Restaurants')
plt.show()


In [None]:
import community


In [None]:
# Detect communities using Louvain algorithm
partition = community.best_partition(G)

# Plot the graph with communities in different colors
pos = nx.spring_layout(G)  # You can use other layouts
# plt.cm.get_cmap is deprecated and removed in recent Matplotlib releases;
# pyplot.get_cmap accepts the same (name, lut) arguments.
cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
# Look the community up per node, in G's own node order, instead of relying on
# the iteration order of the partition dict matching the order nodes are drawn in.
node_colors = [partition[node] for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_size=30, cmap=cmap, node_color=node_colors)
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.title('Community Detection Visualization')
plt.show()

In [None]:
# Two-column layout: user_nodes at x=0 and venue_nodes at x=1, stacked in
# enumeration order.
# NOTE(review): user_nodes / venue_nodes are also reassigned by the cells that
# build graph B; this cell presumably expects the ones derived from G — confirm
# the execution order.
pos = {
    **{node: (0, row) for row, node in enumerate(user_nodes)},
    **{node: (1, row) for row, node in enumerate(venue_nodes)},
}

plt.figure(figsize=(10, 6))  # wider canvas for readability
nx.draw(G, pos=pos, node_color='skyblue', edge_color='gray', node_size=50)
plt.title('Bipartite Graph of Users and Venues')
plt.show()

In [None]:
import networkx as nx
import pandas as pd
import math
import community

# Scoring Methods

def common_neighbors_score(G, node1, node2):
    """Return how many neighbours ``node1`` and ``node2`` have in common in ``G``."""
    return len(set(G.neighbors(node1)) & set(G.neighbors(node2)))

def jaccard_coefficient_score(G, node1, node2):
    """Return the Jaccard coefficient of the two nodes' neighbourhoods.

    That is |N(node1) ∩ N(node2)| / |N(node1) ∪ N(node2)|, or 0 when neither
    node has any neighbour.
    """
    first = set(G.neighbors(node1))
    second = set(G.neighbors(node2))
    combined = first | second
    if len(combined) == 0:
        return 0
    return len(first & second) / len(combined)

def adamic_adar_index_score(G, node1, node2):
    """Return the Adamic-Adar index of ``node1`` and ``node2``.

    Sums 1 / log(degree) over the neighbours the two nodes share; shared
    neighbours of degree 1 or less are skipped because log(1) is 0.
    """
    shared = set(G.neighbors(node1)).intersection(G.neighbors(node2))
    shared_degrees = (G.degree(neighbor) for neighbor in shared)
    return sum(1 / math.log(degree) for degree in shared_degrees if degree > 1)

def preferential_attachment_score(G, node1, node2):
    """Return the product of the two nodes' degrees in ``G``."""
    first_degree, second_degree = G.degree(node1), G.degree(node2)
    return first_degree * second_degree

# ...

# Score every user-venue pair that is not yet connected, then show the scores of
# one sample pair. Nothing is scored when the graph is empty or one side is missing.
if G.number_of_nodes() == 0:
    print("The graph is empty.")
else:
    user_nodes = {n for n, d in G.nodes(data=True) if 'user_ID' in d}
    venue_nodes = set(G) - user_nodes

    # (user, venue) -> {method name: score} for all potential edges
    edge_scores = {}
    sample_user = next(iter(user_nodes), None)
    sample_venue = next(iter(venue_nodes), None)

    if sample_user is None or sample_venue is None:
        print("No users or venues in the graph.")
    else:
        scoring_methods = (
            ('Common Neighbors', common_neighbors_score),
            ('Jaccard Coefficient', jaccard_coefficient_score),
            ('Adamic-Adar Index', adamic_adar_index_score),
            ('Preferential Attachment', preferential_attachment_score),
        )
        for user in user_nodes:
            for venue in venue_nodes:
                if G.has_edge(user, venue):
                    continue  # already linked, not a candidate
                edge_scores[(user, venue)] = {
                    method_name: score_fn(G, user, venue)
                    for method_name, score_fn in scoring_methods
                }

        # Display scores for the sample pair (nothing is listed if that pair is already an edge)
        print(f"Scores for edge ({sample_user}, {sample_venue}):")
        for method, score in edge_scores.get((sample_user, sample_venue), {}).items():
            print(f"{method}: {score}")

        # Community of the sample user; falls back to 'Not assigned' when the
        # node has no 'community' attribute yet (it is only set further below).
        if sample_user not in G.nodes:
            print(f"{sample_user} is not in the graph.")
        else:
            sample_user_community = G.nodes[sample_user].get('community', 'Not assigned')
            print(f"Community of {sample_user}: {sample_user_community}")



# Community Detection

# Run Louvain on G; the result maps each node to a community id
partition = community.best_partition(G)

# Store the community id on every node under the 'community' attribute
nx.set_node_attributes(G, values=partition, name='community')

# Show which community the sample user ended up in
sample_user_community = G.nodes[sample_user]['community']
print(f"Community of {sample_user}: {sample_user_community}")


In [None]:
# Create a bipartite user-venue graph from the check-ins.
#
# IDs are stringified once, column-wise, and the same strings are used for both
# the nodes and the edges. DataFrame.iterrows() (previously used for the edges)
# does not preserve dtypes across a row, so its stringified IDs could differ
# from the node names built from the columns (e.g. '1.0' vs '1') and silently
# create extra, attribute-less nodes.
checkin_users = [str(user) for user in checkins_df['user_ID']]
checkin_venues = [str(venue) for venue in checkins_df['venue_ID']]

G = nx.Graph()

# Add user nodes tagged with a 'user_ID' attribute
G.add_nodes_from([(user, {'user_ID': user}) for user in checkin_users])

# Add venue nodes tagged with a 'venue_ID' attribute
# NOTE(review): users and venues share one namespace after str(); an ID that
# occurs in both columns becomes a single node carrying both attributes and is
# classified as a user below — confirm the two ID spaces are disjoint.
G.add_nodes_from([(venue, {'venue_ID': venue}) for venue in checkin_venues])

# Add one edge per check-in row
checkin_edges = list(zip(checkin_users, checkin_venues))
G.add_edges_from(checkin_edges)

# Separate users and venues for scoring
user_nodes = {n for n, d in G.nodes(data=True) if 'user_ID' in d}
venue_nodes = set(G) - user_nodes


In [None]:
import networkx as nx
import pandas as pd
import math
import community

# Scoring Methods

def common_neighbors_score(G, node1, node2):
    """Return how many neighbours ``node1`` and ``node2`` have in common in ``G``."""
    return len(set(G.neighbors(node1)) & set(G.neighbors(node2)))

def jaccard_coefficient_score(G, node1, node2):
    """Return the Jaccard coefficient of the two nodes' neighbourhoods.

    That is |N(node1) ∩ N(node2)| / |N(node1) ∪ N(node2)|, or 0 when neither
    node has any neighbour.
    """
    first = set(G.neighbors(node1))
    second = set(G.neighbors(node2))
    combined = first | second
    if len(combined) == 0:
        return 0
    return len(first & second) / len(combined)

def adamic_adar_index_score(G, node1, node2):
    """Return the Adamic-Adar index of ``node1`` and ``node2``.

    Sums 1 / log(degree) over the neighbours the two nodes share; shared
    neighbours of degree 1 or less are skipped because log(1) is 0.
    """
    shared = set(G.neighbors(node1)).intersection(G.neighbors(node2))
    shared_degrees = (G.degree(neighbor) for neighbor in shared)
    return sum(1 / math.log(degree) for degree in shared_degrees if degree > 1)

def preferential_attachment_score(G, node1, node2):
    """Return the product of the two nodes' degrees in ``G``."""
    first_degree, second_degree = G.degree(node1), G.degree(node2)
    return first_degree * second_degree

# ...

# Score every user-venue pair that is not yet connected, then show the scores of
# one sample pair. Nothing is scored when the graph is empty or one side is missing.
if G.number_of_nodes() == 0:
    print("The graph is empty.")
else:
    user_nodes = {n for n, d in G.nodes(data=True) if 'user_ID' in d}
    venue_nodes = set(G) - user_nodes

    # (user, venue) -> {method name: score} for all potential edges
    edge_scores = {}
    sample_user = next(iter(user_nodes), None)
    sample_venue = next(iter(venue_nodes), None)

    if sample_user is None or sample_venue is None:
        print("No users or venues in the graph.")
    else:
        scoring_methods = (
            ('Common Neighbors', common_neighbors_score),
            ('Jaccard Coefficient', jaccard_coefficient_score),
            ('Adamic-Adar Index', adamic_adar_index_score),
            ('Preferential Attachment', preferential_attachment_score),
        )
        for user in user_nodes:
            for venue in venue_nodes:
                if G.has_edge(user, venue):
                    continue  # already linked, not a candidate
                edge_scores[(user, venue)] = {
                    method_name: score_fn(G, user, venue)
                    for method_name, score_fn in scoring_methods
                }

        # Display scores for the sample pair (nothing is listed if that pair is already an edge)
        print(f"Scores for edge ({sample_user}, {sample_venue}):")
        for method, score in edge_scores.get((sample_user, sample_venue), {}).items():
            print(f"{method}: {score}")

        # Community of the sample user; falls back to 'Not assigned' when the
        # node has no 'community' attribute yet (it is only set further below).
        if sample_user not in G.nodes:
            print(f"{sample_user} is not in the graph.")
        else:
            sample_user_community = G.nodes[sample_user].get('community', 'Not assigned')
            print(f"Community of {sample_user}: {sample_user_community}")



# Community Detection

# Run Louvain on G; the result maps each node to a community id
partition = community.best_partition(G)

# Store the community id on every node under the 'community' attribute
nx.set_node_attributes(G, values=partition, name='community')

# Show which community the sample user ended up in
sample_user_community = G.nodes[sample_user]['community']
print(f"Community of {sample_user}: {sample_user_community}")


In [None]:
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming G is the bipartite graph you created earlier

# Hold out 20% of G's edges as the test set (fixed seed, so the split is reproducible)
edges = list(G.edges())
edges_train, edges_test = train_test_split(edges, random_state=42, test_size=0.2)

# Function to predict links using a similarity metric
def predict_links(graph, similarity_function, edges_to_predict):
    """Score every candidate edge with ``similarity_function``.

    Args:
        graph: Graph object handed through to ``similarity_function``.
        similarity_function: Callable ``(graph, user, venue) -> score``.
        edges_to_predict: Iterable of ``(user, venue)`` pairs.

    Returns:
        A list of ``(user, venue, score)`` tuples in input order.
    """
    return [
        (user, venue, similarity_function(graph, user, venue))
        for user, venue in edges_to_predict
    ]

# Example similarity function (replace with your actual functions)
def common_neighbors_score(graph, user, venue):
    """Return the number of neighbours ``user`` and ``venue`` share in ``graph``.

    NOTE(review): in a strictly bipartite user-venue graph a user's neighbours
    are venues and a venue's neighbours are users, so this would always be 0 —
    confirm that this is the intended example metric.
    """
    return len(set(graph.neighbors(user)) & set(graph.neighbors(venue)))

# Calculate precision and recall for a given similarity metric
def calculate_precision_recall(graph, similarity_function, edges_train, edges_test):
    """Compute precision and recall of a thresholded similarity score.

    Every pair in ``edges_train`` is scored with ``similarity_function``; a
    score above 0.5 counts as a predicted link. A scored pair is labelled
    positive when the same ``(user, venue)`` pair also appears in
    ``edges_test``.

    Args:
        graph: Graph passed through to ``similarity_function``.
        similarity_function: Callable ``(graph, user, venue) -> score``.
        edges_train: Iterable of ``(user, venue)`` pairs to score.
        edges_test: Iterable of ``(user, venue)`` pairs treated as positives.

    Returns:
        ``(precision, recall)``; ``(0.0, 0.0)`` when no scored pair is in
        ``edges_test``.

    NOTE(review): when ``edges_train`` and ``edges_test`` are the two disjoint
    halves of one ``train_test_split``, no scored pair can be in
    ``edges_test`` and the result is always ``(0.0, 0.0)``. Scoring a
    candidate list that contains the held-out edges is presumably what was
    intended — confirm with the author.
    """
    # Score with the metric the caller passed in (common_neighbors_score used
    # to be hard-coded here, so the argument was silently ignored).
    model_predictions = predict_links(graph, similarity_function, edges_train)

    # Label by (user, venue) pair. The predictions are (user, venue, score)
    # triples, which can never equal a 2-tuple edge, so they must be unpacked.
    test_edge_set = {tuple(edge) for edge in edges_test}
    true_labels = [
        1 if (user, venue) in test_edge_set else 0
        for user, venue, _ in model_predictions
    ]

    # Without any positive label precision and recall are undefined; report 0.
    if sum(true_labels) == 0:
        return 0.0, 0.0

    predicted_scores = [score for _, _, score in model_predictions]

    # Convert scores into binary predictions with a fixed threshold.
    threshold = 0.5
    binary_predictions = [1 if score > threshold else 0 for score in predicted_scores]

    precision = precision_score(true_labels, binary_predictions)
    recall = recall_score(true_labels, binary_predictions)

    return precision, recall

# Example usage: evaluate the common-neighbors metric on the train/test edge
# lists produced by the split above.
precision, recall = calculate_precision_recall(G, common_neighbors_score, edges_train, edges_test)

# Print the results as percentages with two decimals.
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")


In [None]:
import networkx as nx
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Function to add edges to the bipartite graph
def add_edges_from_df(graph, df, source_col, target_col):
    """Add one edge per row of ``df`` to ``graph``.

    Args:
        graph: Object exposing ``add_edges_from`` (e.g. a networkx graph).
        df: DataFrame holding the edge list.
        source_col: Name of the column with the edge source.
        target_col: Name of the column with the edge target.

    Both endpoints are converted to ``str`` before being added.
    """
    # Read the two columns directly instead of going through
    # ``DataFrame.iterrows``: iterrows builds a Series per row (slow) and
    # upcasts rows that mix ints and floats, so an integer id could be
    # stringified as '1.0' instead of '1'.
    sources = map(str, df[source_col].tolist())
    targets = map(str, df[target_col].tolist())
    graph.add_edges_from(list(zip(sources, targets)))

# Function to create a bipartite graph from the given datasets
def create_bipartite_graph(checkins_df, user_col, venue_col):
    """Build an undirected graph with one edge per row of ``checkins_df``.

    Edges run between the values of ``user_col`` and ``venue_col``; they are
    added through ``add_edges_from_df``. No ``bipartite`` node attribute is
    set by this function.
    """
    graph = nx.Graph()
    add_edges_from_df(graph, checkins_df, user_col, venue_col)
    return graph

# Scoring methods
def distance_score(graph, user, venue):
    """Return the shortest-path length between ``user`` and ``venue``.

    Returns ``float('inf')`` when the two nodes are not connected or when
    either node is missing from ``graph``. The missing-node case matters in
    this notebook because edge lists can come from a graph whose node ids
    were not converted to ``str``.

    Note that a larger value means the nodes are further apart, so callers
    that threshold with ``score > t`` treat distant pairs as positives.
    """
    try:
        return nx.shortest_path_length(graph, source=user, target=venue)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return float('inf')

def common_neighbors_score_user(graph, user, venue):
    user_neighbors = set(graph.neighbors(user))
    venue_neighbors = set(graph.neighbors(venue))
    common_neighbors = user_neighbors.intersection(venue_neighbors)
    return len(common_neighbors)

def common_neighbors_score_venue(graph, user, venue):
    user_neighbors = set(graph.neighbors(user))
    venue_neighbors = set(graph.neighbors(venue))
    common_neighbors = user_neighbors.intersection(venue_neighbors)
    return len(common_neighbors)

def preferential_attachment_score(graph, user, venue):
    """Return the product of the neighbor counts of ``user`` and ``venue``.

    ``graph.neighbors`` returns an iterator in networkx 2.x and later, and
    calling ``len`` on an iterator raises TypeError, so the neighbors are
    counted by consuming the iterator instead.
    """
    user_degree = sum(1 for _ in graph.neighbors(user))
    venue_degree = sum(1 for _ in graph.neighbors(venue))
    return user_degree * venue_degree

def adamic_adar_user_score(graph, user, venue):
    """Sum ``1 / (1 + degree)`` over the neighbors shared by ``user`` and ``venue``.

    The weight is ``1 / (1 + degree)`` as written in this notebook, not the
    ``1 / log(degree)`` of the classic Adamic-Adar index; it is kept as is.

    ``graph.neighbors`` returns an iterator in networkx 2.x and later, so the
    degree is taken from a materialized list (``len`` on the iterator raises
    TypeError).
    """
    common_neighbors = set(graph.neighbors(user)) & set(graph.neighbors(venue))
    return sum(
        1 / (1 + len(list(graph.neighbors(neighbor))))
        for neighbor in common_neighbors
    )

def adamic_adar_venue_score(graph, user, venue):
    """Sum ``1 / (1 + degree)`` over the neighbors shared by ``user`` and ``venue``.

    This performs the same computation as ``adamic_adar_user_score``. The
    weight is ``1 / (1 + degree)`` as written in this notebook, not the
    ``1 / log(degree)`` of the classic Adamic-Adar index.

    ``graph.neighbors`` returns an iterator in networkx 2.x and later, so the
    degree is obtained by counting the iterator's items (``len`` on the
    iterator raises TypeError).
    """
    shared = set(graph.neighbors(user)).intersection(graph.neighbors(venue))
    total = 0
    for neighbor in shared:
        degree = sum(1 for _ in graph.neighbors(neighbor))
        total += 1 / (1 + degree)
    return total

def katz_score(graph, user, venue, beta=0.005, max_path_length=5):
    """Return a truncated Katz-style score between ``user`` and ``venue``.

    Sums ``beta ** length`` over the simple paths from ``user`` to ``venue``
    that have at most ``max_path_length`` edges, where ``length`` is the
    number of edges on the path. Only simple paths are enumerated, so this
    approximates the walk-based Katz index.

    Args:
        graph: networkx graph to search.
        user: Source node.
        venue: Target node.
        beta: Damping factor applied once per edge.
        max_path_length: Longest path, in edges, that is considered.
    """
    paths = nx.all_simple_paths(graph, source=user, target=venue, cutoff=max_path_length)
    # Each path is a list of nodes, so its length in edges is len(path) - 1;
    # using len(path) would apply one extra factor of beta to every term.
    return sum(beta ** (len(path) - 1) for path in paths)

# Function to predict links using a given scoring method
def predict_links(graph, scoring_function, edges_to_predict):
    """Apply ``scoring_function`` to each ``(user, venue)`` pair.

    Returns:
        A list of ``(user, venue, score)`` tuples, one per input pair and in
        the same order.
    """
    scored = []
    for user, venue in edges_to_predict:
        scored.append((user, venue, scoring_function(graph, user, venue)))
    return scored

# Function to evaluate precision and recall for a given scoring method
def evaluate_scoring_method(graph, scoring_function, edges_train, edges_test):
    """Compute precision and recall of a thresholded scoring method.

    Every pair in ``edges_train`` is scored with ``scoring_function``; a
    score above 0.5 counts as a predicted link. A scored pair is labelled
    positive when the same ``(user, venue)`` pair also appears in
    ``edges_test``.

    Args:
        graph: Graph passed through to ``scoring_function``.
        scoring_function: Callable ``(graph, user, venue) -> score``.
        edges_train: Iterable of ``(user, venue)`` pairs to score.
        edges_test: Iterable of ``(user, venue)`` pairs treated as positives.

    Returns:
        ``(precision, recall)``; ``(0.0, 0.0)`` when no scored pair is in
        ``edges_test``.

    NOTE(review): when ``edges_train`` and ``edges_test`` are the two disjoint
    halves of one ``train_test_split``, no scored pair can be in
    ``edges_test`` and the result is always ``(0.0, 0.0)`` — confirm the
    intended candidate set with the author.
    """
    model_predictions = predict_links(graph, scoring_function, edges_train)

    # Label by (user, venue) pair. The predictions are (user, venue, score)
    # triples, which can never equal a 2-tuple edge, so they must be unpacked.
    test_edge_set = {tuple(edge) for edge in edges_test}
    true_labels = [
        1 if (user, venue) in test_edge_set else 0
        for user, venue, _ in model_predictions
    ]

    # Without any positive label precision and recall are undefined; report 0.
    if sum(true_labels) == 0:
        return 0.0, 0.0

    predicted_scores = [score for _, _, score in model_predictions]

    # Convert scores into binary predictions with a fixed threshold.
    threshold = 0.5
    binary_predictions = [1 if score > threshold else 0 for score in predicted_scores]

    precision = precision_score(true_labels, binary_predictions)
    recall = recall_score(true_labels, binary_predictions)

    return precision, recall

# Example usage
# Assuming you have a CSV file 'checkins.csv' with columns 'user_ID' and 'venue_ID'
# checkins_df = pd.read_csv('checkins.csv')

# Create the bipartite graph first, so that the edge lists below are taken
# from the same graph (with str node ids) that the scoring functions query.
# Splitting before rebuilding G would use whatever G an earlier cell left
# behind, or fail with NameError on a fresh kernel.
G = create_bipartite_graph(checkins_df, 'user_ID', 'venue_ID')

# Split the edges into training (90%) and test (10%) sets
edges = list(G.edges())
edges_train, edges_test = train_test_split(edges, test_size=0.1, random_state=50)

# Example usage of the scoring methods
precision, recall = evaluate_scoring_method(G, distance_score, edges_train, edges_test)
print(f"Precision: {precision * 100:.5f}%")
print(f"Recall: {recall * 100:.5f}%")

# Repeat for other scoring methods
# precision, recall = evaluate_scoring_method(G, common_neighbors_score_user, edges_train, edges_test)
# precision, recall = evaluate_scoring_method(G, common_neighbors_score_venue, edges_train, edges_test)
# precision, recall = evaluate_scoring_method(G, preferential_attachment_score, edges_train, edges_test)
# precision, recall = evaluate_scoring_method(G, adamic_adar_user_score, edges_train, edges_test)
# precision, recall = evaluate_scoring_method(G, adamic_adar_venue_score, edges_train, edges_test)
# precision, recall = evaluate_scoring_method(G, katz_score, edges_train, edges_test)


In [None]:
# Notebook cell output: the number of edges in the current `edges` list.
len(edges)

In [None]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Function to add edges to the bipartite graph
def add_edges_from_df(graph, df, source_col, target_col):
    """Add one edge per row of ``df`` to ``graph``.

    Args:
        graph: Object exposing ``add_edges_from`` (e.g. a networkx graph).
        df: DataFrame holding the edge list.
        source_col: Name of the column with the edge source.
        target_col: Name of the column with the edge target.

    Both endpoints are converted to ``str`` before being added.
    """
    # Pull the two columns out once instead of iterating with
    # ``DataFrame.iterrows``, which creates a Series per row (slow) and
    # upcasts rows mixing ints and floats, turning an integer id into '1.0'.
    edge_pairs = [
        (str(source), str(target))
        for source, target in zip(df[source_col].tolist(), df[target_col].tolist())
    ]
    graph.add_edges_from(edge_pairs)

# Function to create a bipartite graph from the given datasets
def create_bipartite_graph(checkins_df, user_col, venue_col):
    """Return a new undirected graph holding one edge per check-in row.

    The edges connect the values of ``user_col`` and ``venue_col`` and are
    inserted by ``add_edges_from_df``. No ``bipartite`` node attribute is
    assigned here.
    """
    checkin_graph = nx.Graph()
    add_edges_from_df(checkin_graph, checkins_df, user_col, venue_col)
    return checkin_graph

# Scoring methods
# ... (same as before)

# Function to predict links using a given scoring method
def predict_links(graph, scoring_function, edges_to_predict):
    """Return ``(user, venue, score)`` for each pair in ``edges_to_predict``.

    ``score`` is ``scoring_function(graph, user, venue)``; the output keeps
    the order of the input pairs.
    """
    def score_pair(pair):
        user, venue = pair
        return user, venue, scoring_function(graph, user, venue)

    return [score_pair(pair) for pair in edges_to_predict]

# Function to evaluate precision and recall for a given scoring method
def evaluate_scoring_method(graph, scoring_function, edges_train, edges_test, visualize=False):
    """Compute precision and recall of a thresholded scoring method.

    Every pair in ``edges_train`` is scored with ``scoring_function``; a
    score above 0.5 counts as a predicted link. A scored pair is labelled
    positive when the same ``(user, venue)`` pair also appears in
    ``edges_test``.

    Args:
        graph: Graph passed through to ``scoring_function``.
        scoring_function: Callable ``(graph, user, venue) -> score``.
        edges_train: Iterable of ``(user, venue)`` pairs to score.
        edges_test: Iterable of ``(user, venue)`` pairs treated as positives.
        visualize: When true, show a histogram of the finite scores.

    Returns:
        ``(precision, recall)`` as computed by sklearn.

    NOTE(review): when ``edges_train`` and ``edges_test`` are the two disjoint
    halves of one ``train_test_split``, no scored pair can be in
    ``edges_test`` and every label is 0 — confirm the intended candidate set
    with the author.
    """
    import math  # local import: only needed to filter the histogram input

    model_predictions = predict_links(graph, scoring_function, edges_train)

    # Label by (user, venue) pair. The predictions are (user, venue, score)
    # triples, which can never equal a 2-tuple edge, so they must be unpacked.
    test_edge_set = {tuple(edge) for edge in edges_test}
    true_labels = [
        1 if (user, venue) in test_edge_set else 0
        for user, venue, _ in model_predictions
    ]

    predicted_scores = [score for _, _, score in model_predictions]

    # Visualize the score distribution
    if visualize:
        # Scoring functions such as distance_score can return +/-inf, which
        # a histogram cannot bin, so only finite scores are plotted.
        finite_scores = [score for score in predicted_scores if math.isfinite(score)]
        plt.hist(finite_scores, bins=20, edgecolor='black')
        plt.xlabel('Score')
        plt.ylabel('Frequency')
        plt.title('Score Distribution')
        plt.show()

    # Convert scores into binary predictions with a fixed threshold.
    threshold = 0.5
    binary_predictions = [1 if score > threshold else 0 for score in predicted_scores]

    precision = precision_score(true_labels, binary_predictions)
    recall = recall_score(true_labels, binary_predictions)

    return precision, recall

# Example usage
# Assuming you have a CSV file 'checkins.csv' with columns 'user_ID' and 'venue_ID'
# checkins_df = pd.read_csv('checkins.csv')

# Create the bipartite graph first, so that the edge lists below are taken
# from the same graph (with str node ids) that the scoring functions query.
# Splitting before rebuilding G would use whatever G an earlier cell left
# behind, or fail with NameError on a fresh kernel.
G = create_bipartite_graph(checkins_df, 'user_ID', 'venue_ID')

# Split the edges into training (80%) and test (20%) sets
edges = list(G.edges())
edges_train, edges_test = train_test_split(edges, test_size=0.2, random_state=42)

# Example usage of the scoring methods with visualization
precision, recall = evaluate_scoring_method(G, distance_score, edges_train, edges_test, visualize=True)
print(f"Precision: {precision * 100:.5f}%")
print(f"Recall: {recall * 100:.5f}%")


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tip_text'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# # Create a bipartite graph
# B = nx.Graph()

# # Add nodes and edges for check-ins
# B.add_nodes_from(checkins_df['user_ID'].unique(), bipartite=0, label='user')
# B.add_nodes_from(checkins_df['venue_ID'].unique(), bipartite=1, label='venue')
# B.add_edges_from(zip(checkins_df['user_ID'], checkins_df['venue_ID']))

# # Add nodes and edges for tips
# B.add_nodes_from(tips_df['user_ID'].unique(), bipartite=0, label='user')
# B.add_nodes_from(tips_df['venue_ID'].unique(), bipartite=1, label='venue')
# B.add_edges_from(zip(tips_df['user_ID'], tips_df['venue_ID']))

# # Add nodes for tags
# B.add_nodes_from(tags_df['venue_ID'].unique(), bipartite=1, label='venue')

# Define a function to calculate Jaccard similarity between neighbors
def jaccard_similarity(G, node1, node2, bipartite_attribute='bipartite'):
    """Return the Jaccard similarity of the neighbor sets of two nodes.

    For each node, only neighbors whose ``bipartite_attribute`` value differs
    from the node's own value are kept. The result is
    ``|A & B| / |A | B|``, or ``0`` when both filtered sets are empty.
    """
    node_data = G.nodes

    def cross_side_neighbors(node):
        # Keep neighbors that sit on the other side of the bipartition.
        return {
            neigh
            for neigh in G.neighbors(node)
            if node_data[neigh][bipartite_attribute] != node_data[node][bipartite_attribute]
        }

    first = cross_side_neighbors(node1)
    second = cross_side_neighbors(node2)

    combined_size = len(first | second)
    if combined_size == 0:
        return 0
    return len(first & second) / combined_size

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Jaccard similarity for every ordered pair of train nodes.
# NOTE(review): this is quadratic in the number of train nodes and the result
# is not used below — confirm whether another cell needs it.
jaccard_similarities_train = [jaccard_similarity(B, node1, node2) for node1 in train_nodes for node2 in train_nodes]

# Define a threshold for predicting edges in the test set
threshold = 0.01  # Adjust as needed

# Predict an edge for every (test node, train node) pair with non-zero
# Jaccard similarity. The thresholded variant is kept commented out below.
predicted_edges = [(node1, node2) for node1 in test_nodes for node2 in train_nodes
                    if jaccard_similarity(B, node1, node2) > 0]
#                    if jaccard_similarity(B, node1, node2) > threshold]

# Calculate precision and recall.
# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_edge_set = {frozenset(edge) for edge in predicted_edges}

true_positive = len(predicted_edge_set & actual_edge_set)
false_positive = len(predicted_edge_set - actual_edge_set)
false_negative = len(actual_edge_set - predicted_edge_set)

precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0
recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0

print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.5f}')


In [None]:
def negative_shortest_distance_score(G, source, target):
    """Return the negated shortest-path length from ``source`` to ``target``.

    Closer nodes therefore get a higher (less negative) score. When no path
    exists the score is negative infinity.
    """
    try:
        hops = nx.shortest_path_length(G, source=source, target=target)
    except nx.NetworkXNoPath:
        # Unreachable pairs rank below every reachable pair.
        return float('-inf')
    return -hops

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Negative shortest-distance score for every ordered pair of train nodes.
# NOTE(review): this rebinds `edge_scores`, which an earlier cell reads as a
# dict (`edge_scores.get(...)`); the list is not used below — confirm
# whether it is needed.
edge_scores = [negative_shortest_distance_score(B, user, venue) for user in train_nodes for venue in train_nodes]

# Define a threshold for predicting edges in the test set
threshold = -8  # Adjust as needed

# Predict an edge for every (test node, train node) pair that is fewer than
# 8 hops apart.
predicted_edges = [(user, venue) for user in test_nodes for venue in train_nodes
                   if negative_shortest_distance_score(B, user, venue) > threshold]

# Calculate precision and recall.
# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_edge_set = {frozenset(edge) for edge in predicted_edges}

true_positive = len(predicted_edge_set & actual_edge_set)
false_positive = len(predicted_edge_set - actual_edge_set)
false_negative = len(actual_edge_set - predicted_edge_set)

precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) != 0 else 0
recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) != 0 else 0
print(f'Precision: {precision:.5f}')
print(f'Recall: {recall:.5f}')

In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

def common_neighbors_user_score(G, user, venue):
    """Return the largest neighbor overlap between ``venue`` and a neighbor of ``user``.

    For every neighbor ``n`` of ``user``, counts the nodes adjacent to both
    ``n`` and ``venue`` and returns the maximum count. Returns 0 when
    ``venue`` has no neighbors or ``user`` has no neighbors.
    """
    around_user = set(G.neighbors(user))
    around_venue = set(G.neighbors(venue))

    if not around_venue:
        return 0

    best_overlap = 0
    for node in around_user:
        overlap = len(around_venue & set(G.neighbors(node)))
        if overlap > best_overlap:
            best_overlap = overlap
    return best_overlap

def common_neighbors_venue_score(G, user, venue):
    """Return the largest neighbor overlap between ``user`` and a neighbor of ``venue``.

    For every neighbor ``n`` of ``venue``, counts the nodes adjacent to both
    ``n`` and ``user`` and returns the maximum count. Returns 0 when ``user``
    has no neighbors or ``venue`` has no neighbors.
    """
    around_user = set(G.neighbors(user))
    around_venue = set(G.neighbors(venue))

    if len(around_user) == 0:
        return 0

    overlaps = [len(around_user & set(G.neighbors(node))) for node in around_venue]
    return max(overlaps) if overlaps else 0

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Common Neighbors scores for every ordered pair of train nodes.
# NOTE(review): quadratic in the number of train nodes and not used below —
# confirm whether another cell needs these lists.
user_scores = [common_neighbors_user_score(B, user, venue) for user in train_nodes for venue in train_nodes]
venue_scores = [common_neighbors_venue_score(B, user, venue) for user in train_nodes for venue in train_nodes]

# Define a threshold for predicting edges in the test set
threshold = 1  # Adjust as needed

# Predict edges in the test set based on User Common Neighbors score
predicted_edges_user = [(user, venue) for user in test_nodes for venue in train_nodes
                        if common_neighbors_user_score(B, user, venue) > threshold]

# Predict edges in the test set based on Venue Common Neighbors score
predicted_edges_venue = [(user, venue) for user in test_nodes for venue in train_nodes
                         if common_neighbors_venue_score(B, user, venue) > threshold]

# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_user_set = {frozenset(edge) for edge in predicted_edges_user}
predicted_venue_set = {frozenset(edge) for edge in predicted_edges_venue}

# Calculate precision and recall for User Common Neighbors score
true_positive_user = len(predicted_user_set & actual_edge_set)
false_positive_user = len(predicted_user_set - actual_edge_set)
false_negative_user = len(actual_edge_set - predicted_user_set)

precision_user = true_positive_user / (true_positive_user + false_positive_user) if (true_positive_user + false_positive_user) != 0 else 0
recall_user = true_positive_user / (true_positive_user + false_negative_user) if (true_positive_user + false_negative_user) != 0 else 0

# Calculate precision and recall for Venue Common Neighbors score
true_positive_venue = len(predicted_venue_set & actual_edge_set)
false_positive_venue = len(predicted_venue_set - actual_edge_set)
false_negative_venue = len(actual_edge_set - predicted_venue_set)

precision_venue = true_positive_venue / (true_positive_venue + false_positive_venue) if (true_positive_venue + false_positive_venue) != 0 else 0
recall_venue = true_positive_venue / (true_positive_venue + false_negative_venue) if (true_positive_venue + false_negative_venue) != 0 else 0

print(f'User Common Neighbors Score:')
print(f'Precision: {precision_user:.5f}')
print(f'Recall: {recall_user:.5f}')

print(f'\nVenue Common Neighbors Score:')
print(f'Precision: {precision_venue:.5f}')
print(f'Recall: {recall_venue:.5f}')

In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tips'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Assuming you have created a bipartite graph B

# Copy each tip onto the matching user-venue edge of B as the 'tip_text'
# edge attribute. Rows whose (user, venue) pair is not an edge of B are
# skipped; a later row for the same pair overwrites an earlier one.
# NOTE(review): reads the column 'tips' from tips_df — confirm the column
# name against how tips_df is loaded.
for _, row in tips_df.iterrows():
    user = row['user_ID']
    venue = row['venue_ID']
    tip_text = row['tips']
    
    if B.has_edge(user, venue):
        B[user][venue]['tip_text'] = tip_text

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Extract features from tip text using TF-IDF: one row per edge of B that
# carries a 'tip_text' attribute, in B.edges() order.
vectorizer = TfidfVectorizer()
tip_texts = [B[user][venue]['tip_text'] for user, venue in B.edges() if 'tip_text' in B[user][venue]]
tfidf_matrix = vectorizer.fit_transform(tip_texts)

# Pairwise cosine similarity between the tip rows above; entry [i, j]
# compares tip_texts[i] with tip_texts[j].
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Predict edges in the test set based on cosine similarity of tip text.
# NOTE(review): cosine_sim is indexed here with the nodes' 'bipartite'
# attribute values, not with tip row positions, so the lookup does not
# compare tips that belong to `user` and `venue`. The intended mapping from
# nodes to tip rows needs to be defined before these results can be trusted.
predicted_edges_tip = [(user, venue) for user in test_nodes for venue in train_nodes
                        if B.has_node(user) and B.has_node(venue)
                        and cosine_sim[B.nodes[user]['bipartite'], B.nodes[venue]['bipartite']] > 0.5]

# Calculate precision and recall for tip-based prediction.
# NOTE(review): the tuple sets are orientation-sensitive — a predicted
# (venue, user) pair will not match an edge that B.edges() reports as
# (user, venue).
true_positive_tip = len(set(predicted_edges_tip).intersection(B.edges()))
false_positive_tip = len(set(predicted_edges_tip) - set(B.edges()))
false_negative_tip = len(set(B.edges()) - set(predicted_edges_tip))

# Guard both ratios against a zero denominator.
precision_tip = true_positive_tip / (true_positive_tip + false_positive_tip) if (true_positive_tip + false_positive_tip) != 0 else 0
recall_tip = true_positive_tip / (true_positive_tip + false_negative_tip) if (true_positive_tip + false_negative_tip) != 0 else 0

print(f'Tip-based Prediction:')
print(f'Precision: {precision_tip:.5f}')
print(f'Recall: {recall_tip:.5f}')


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tips'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Assuming you have created a bipartite graph B

def calculate_combined_score(user, venue, tips_weight=0.5):
    """Return a weighted check-in/tip score for an existing edge of ``B``.

    Args:
        user: First endpoint of the edge in the module-level graph ``B``.
        venue: Second endpoint of the edge.
        tips_weight: Weight of the tip component; the check-in component
            gets ``1 - tips_weight``.

    Returns:
        ``(1 - tips_weight) * 1 + tips_weight * len(tip)`` when the edge
        exists, where ``tip`` is the edge's tip attribute (length 0 when it
        is absent or has no length); ``0`` when the edge does not exist.
    """
    if not B.has_edge(user, venue):
        return 0

    edge_data = B[user][venue]

    # Placeholder check-in component: every existing edge counts as 1.
    checkins_score = 1

    # The tip-annotation cell of this notebook stores the text under
    # 'tip_text'; 'tips' is still honoured first for backward compatibility.
    tip = edge_data.get('tips', edge_data.get('tip_text'))
    try:
        tips_score = len(tip)
    except TypeError:
        # Missing tip (None) or a value without a length, such as NaN.
        tips_score = 0

    return (1 - tips_weight) * checkins_score + tips_weight * tips_score

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Predict an edge for every (test node, train node) pair with a positive
# combined score. calculate_combined_score returns 0 for pairs that are not
# already edges of B, so only existing edges can be predicted here.
predicted_edges_combined = [(user, venue) for user in test_nodes for venue in train_nodes
                             if calculate_combined_score(user, venue) > 0]

# Calculate precision and recall for the combined score.
# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_combined_set = {frozenset(edge) for edge in predicted_edges_combined}

true_positive_combined = len(predicted_combined_set & actual_edge_set)
false_positive_combined = len(predicted_combined_set - actual_edge_set)
false_negative_combined = len(actual_edge_set - predicted_combined_set)

precision_combined = true_positive_combined / (true_positive_combined + false_positive_combined) if (true_positive_combined + false_positive_combined) != 0 else 0
recall_combined = true_positive_combined / (true_positive_combined + false_negative_combined) if (true_positive_combined + false_negative_combined) != 0 else 0

print(f'Combined Score Prediction:')
print(f'Precision: {precision_combined:.2f}')
print(f'Recall: {recall_combined:.2f}')


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tips'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Assuming you have created a bipartite graph B

# Generate train and test sets by splitting the NODES of B 80/20.
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Jaccard coefficient for every (train node, test node) pair, as
# (u, v, score) triples.
user_jaccard_similarity = list(nx.jaccard_coefficient(B, [(u, v) for u in train_nodes for v in test_nodes]))

# Preferential Attachment score for the same pairs.
preferential_attachment = list(nx.preferential_attachment(B, [(u, v) for u in train_nodes for v in test_nodes]))

# Predict an edge when the score exceeds a fixed threshold.
predicted_edges_jaccard = [(u, v) for u, v, p in user_jaccard_similarity if p > 0.01]
predicted_edges_pa = [(u, v) for u, v, pa in preferential_attachment if pa > 10]

# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_jaccard_set = {frozenset(edge) for edge in predicted_edges_jaccard}
predicted_pa_set = {frozenset(edge) for edge in predicted_edges_pa}

# Calculate precision and recall for Jaccard-based prediction
true_positive_jaccard = len(predicted_jaccard_set & actual_edge_set)
false_positive_jaccard = len(predicted_jaccard_set - actual_edge_set)
false_negative_jaccard = len(actual_edge_set - predicted_jaccard_set)

precision_jaccard = true_positive_jaccard / (true_positive_jaccard + false_positive_jaccard) if (true_positive_jaccard + false_positive_jaccard) != 0 else 0
recall_jaccard = true_positive_jaccard / (true_positive_jaccard + false_negative_jaccard) if (true_positive_jaccard + false_negative_jaccard) != 0 else 0

# Calculate precision and recall for Preferential Attachment-based prediction
true_positive_pa = len(predicted_pa_set & actual_edge_set)
false_positive_pa = len(predicted_pa_set - actual_edge_set)
false_negative_pa = len(actual_edge_set - predicted_pa_set)

precision_pa = true_positive_pa / (true_positive_pa + false_positive_pa) if (true_positive_pa + false_positive_pa) != 0 else 0
recall_pa = true_positive_pa / (true_positive_pa + false_negative_pa) if (true_positive_pa + false_negative_pa) != 0 else 0

print(f'Jaccard-based Prediction:')
print(f'Precision: {precision_jaccard:.5f}')
print(f'Recall: {recall_jaccard:.5f}')

print(f'Preferential Attachment-based Prediction:')
print(f'Precision: {precision_pa:.5f}')
print(f'Recall: {recall_pa:.5f}')


In [None]:
# Recompute precision/recall for the Jaccard-based prediction.
# true_positive_jaccard, false_positive_jaccard and false_negative_jaccard
# are the integer counts produced by the previous cell, so they are used
# directly: wrapping them in len() raises TypeError, and the original lines
# also had unbalanced parentheses. Precision is guarded by its own
# denominator (TP + FP).
precision_jaccard = true_positive_jaccard / (true_positive_jaccard + false_positive_jaccard) if (true_positive_jaccard + false_positive_jaccard) != 0 else 0
recall_jaccard = true_positive_jaccard / (true_positive_jaccard + false_negative_jaccard) if (true_positive_jaccard + false_negative_jaccard) != 0 else 0

# Edges are compared as unordered pairs: B.edges() reports each edge in one
# orientation only, so a correct prediction written as (venue, user) would
# otherwise never match an edge stored as (user, venue).
# NOTE(review): assumes B is undirected — confirm.
actual_edge_set = {frozenset(edge) for edge in B.edges()}
predicted_pa_set = {frozenset(edge) for edge in predicted_edges_pa}

true_positive_pa = len(predicted_pa_set & actual_edge_set)
false_positive_pa = len(predicted_pa_set - actual_edge_set)
false_negative_pa = len(actual_edge_set - predicted_pa_set)

precision_pa = true_positive_pa / (true_positive_pa + false_positive_pa) if (true_positive_pa + false_positive_pa) != 0 else 0
recall_pa = true_positive_pa / (true_positive_pa + false_negative_pa) if (true_positive_pa + false_negative_pa) != 0 else 0

print(f'Jaccard-based Prediction:')
print(f'Precision: {precision_jaccard:.5f}')
print(f'Recall: {recall_jaccard:.5f}')

print(f'Preferential Attachment-based Prediction:')
print(f'Precision: {precision_pa:.5f}')
print(f'Recall: {recall_pa:.5f}')


In [None]:
# Resource Allocation Index for node pairs of B, as (u, v, score) triples.
# NOTE(review): no pair list is passed; per the networkx documentation the
# function then scores only node pairs that are NOT already edges — confirm.
# If so, no prediction below can coincide with an existing check-in edge.
resource_allocation_index = list(nx.resource_allocation_index(B))

# Set a threshold for predicting edges based on Resource Allocation Index
threshold_resource_allocation = 0.0  # Adjust as needed

# Predict an edge for every scored pair whose index exceeds the threshold.
predicted_edges_resource_allocation = [(u, v) for u, v, score in resource_allocation_index if score > threshold_resource_allocation]

# Ground truth: every (user_ID, venue_ID) row of df_checkins, i.e. the full
# check-in list rather than a held-out test set.
# Calculate True Positives, False Positives, and False Negatives
# NOTE(review): the tuple comparison is orientation-sensitive — a predicted
# (venue, user) pair does not match a (user, venue) row.
actual_edges = set(df_checkins[['user_ID', 'venue_ID']].itertuples(index=False))
true_positive_resource_allocation = len(set(predicted_edges_resource_allocation).intersection(actual_edges))
false_positive_resource_allocation = len(set(predicted_edges_resource_allocation) - actual_edges)
false_negative_resource_allocation = len(actual_edges - set(predicted_edges_resource_allocation))

# Calculate precision and recall for Resource Allocation Index, guarding
# both ratios against a zero denominator.
precision_resource_allocation = true_positive_resource_allocation / (true_positive_resource_allocation + false_positive_resource_allocation) if (true_positive_resource_allocation + false_positive_resource_allocation) != 0 else 0
recall_resource_allocation = true_positive_resource_allocation / (true_positive_resource_allocation + false_negative_resource_allocation) if (true_positive_resource_allocation + false_negative_resource_allocation) != 0 else 0

print(f'Resource Allocation Index:')
print(f'Precision: {precision_resource_allocation:.5f}')
print(f'Recall: {recall_resource_allocation:.5f}')

In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded your dataset into a DataFrame named df_checkins
# Replace 'user', 'venue', and other column names with your actual column names

# Build a directed user -> venue graph from the check-ins DataFrame.
B = nx.from_pandas_edgelist(df_checkins, 'user_ID', 'venue_ID', create_using=nx.DiGraph())

# Katz centrality of every node, keyed by node.
katz_centrality = nx.katz_centrality(B)

# Minimum product of endpoint centralities for an edge to be predicted.
threshold_katz = 0.00085  # Adjust as needed

# Keep the existing edges of B whose endpoint centralities multiply to more
# than the threshold; only edges already in B are considered.
predicted_edges_katz = [
    (u, v)
    for u, v in B.edges()
    if katz_centrality[u] * katz_centrality[v] > threshold_katz
]

# Compare against `actual_edges`, which must already be defined by an
# earlier cell (its definition is commented out here).
# actual_edges = set(df_checkins[['user_ID', 'venue_ID']].itertuples(index=False))
true_positive_katz = len(set(predicted_edges_katz) & actual_edges)
false_positive_katz = len(set(predicted_edges_katz) - actual_edges)
false_negative_katz = len(actual_edges - set(predicted_edges_katz))

# Precision and recall, each guarded against a zero denominator.
precision_katz = true_positive_katz / (true_positive_katz + false_positive_katz) if (true_positive_katz + false_positive_katz) != 0 else 0
recall_katz = true_positive_katz / (true_positive_katz + false_negative_katz) if (true_positive_katz + false_negative_katz) != 0 else 0

print(f'Katz Centrality:')
print(f'Precision: {precision_katz:.5f}')
print(f'Recall: {recall_katz:.5f}')


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Assuming you have loaded your dataset into a DataFrame named df_checkins
# Replace 'user', 'venue', and other column names with your actual column names

# Create a directed user -> venue graph from the check-ins DataFrame
B = nx.from_pandas_edgelist(df_checkins, 'user_ID', 'venue_ID', create_using=nx.DiGraph())

# Katz centrality of every node, keyed by node.
katz_centrality = nx.katz_centrality(B)

# Set a lower threshold for predicting edges based on the katz_centrality values
threshold_katz = 0.0008  # Adjust as needed

# Keep the existing edges of B whose endpoint centralities multiply to more
# than the threshold. Only edges already in B are considered, so every
# prediction is an actual edge by construction.
predicted_edges_katz = [(u, v) for u, v, katz_u, katz_v in
                         [(u, v, katz_centrality[u], katz_centrality[v]) for u, v in B.edges()]
                         if katz_u * katz_v > threshold_katz]

# Calculate True Positives, False Positives, and False Negatives against the
# edges of B itself.
# actual_edges = set(df_checkins[['user_ID', 'venue_ID']].itertuples(index=False))
true_positive_katz = len(set(predicted_edges_katz).intersection(set(B.edges())))
false_positive_katz = len(set(predicted_edges_katz) - set(B.edges()))
false_negative_katz = len(set(B.edges()) - set(predicted_edges_katz))

# Calculate precision and recall for Katz Centrality.
# Precision is TP / (TP + FP); the denominator used to be multiplied by 10,
# which reported a tenth of the real precision.
precision_katz = true_positive_katz / (true_positive_katz + false_positive_katz) if (true_positive_katz + false_positive_katz) != 0 else 0
recall_katz = true_positive_katz / (true_positive_katz + false_negative_katz) if (true_positive_katz + false_negative_katz) != 0 else 0

print(f'Katz Centrality:')
print(f'Precision: {precision_katz:.5f}')
print(f'Recall: {recall_katz:.5f}')


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Common-neighbours link prediction with a held-out edge split.
# Expects `df_checkins` (a DataFrame with 'user_ID' and 'venue_ID' columns)
# to have been loaded by an earlier cell.

# Create a directed graph (user -> venue) from the check-ins DataFrame
G = nx.from_pandas_edgelist(df_checkins, 'user_ID', 'venue_ID', create_using=nx.DiGraph())

# from_pandas_edgelist does not attach a 'bipartite' node attribute, so the
# two node sets are taken from the columns the graph was built from.
user_nodes = set(df_checkins['user_ID'])
venue_nodes = set(df_checkins['venue_ID'])

# Positive samples are the observed edges; negative samples are every
# user-venue pair without an edge.
# NOTE: the negative list is |users| x |venues| in size and can be very large.
positive_edges = list(G.edges())
negative_edges = [(u, v) for u in user_nodes for v in venue_nodes if not G.has_edge(u, v)]

# Split the data into training and testing sets
positive_train, positive_test = train_test_split(positive_edges, test_size=0.2, random_state=42)
negative_train, negative_test = train_test_split(negative_edges, test_size=0.2, random_state=42)

# Combine positive and negative samples for training and testing
train_edges = positive_train + negative_train
test_edges = positive_test + negative_test

# Training graph: hide the held-out positive edges so they cannot leak into
# the similarity scores. (Negative samples are not edges of G, so removing
# them would change nothing.)
G_train = G.copy()
G_train.remove_edges_from(positive_test)

# nx.common_neighbors is only implemented for undirected graphs, so score on
# an undirected view of the training graph.
# NOTE(review): if user and venue ids never coincide, the graph is bipartite
# and a user-venue pair cannot share a neighbour, so every count would be 0.
# Confirm this is the intended metric.
G_train_undirected = G_train.to_undirected(as_view=True)
common_neighbors = [
    (u, v, sum(1 for _ in nx.common_neighbors(G_train_undirected, u, v)))
    for u, v in test_edges
]

# Set a threshold for predicting edges based on the number of common neighbors
threshold_common_neighbors = 2  # Adjust as needed

# Predict the test pairs whose common-neighbour count exceeds the threshold
predicted_edges_common_neighbors = [(u, v) for u, v, common_neighbors_count in common_neighbors if common_neighbors_count > threshold_common_neighbors]

# Ground truth is the held-out positive edges only
actual_edges = set(positive_test)
common_neighbors_predicted_set = set(predicted_edges_common_neighbors)
true_positive_common_neighbors = len(common_neighbors_predicted_set & actual_edges)
false_positive_common_neighbors = len(common_neighbors_predicted_set - actual_edges)
false_negative_common_neighbors = len(actual_edges - common_neighbors_predicted_set)

# Calculate precision and recall for Common Neighbors (0 when undefined)
precision_common_neighbors = true_positive_common_neighbors / (true_positive_common_neighbors + false_positive_common_neighbors) if (true_positive_common_neighbors + false_positive_common_neighbors) != 0 else 0
recall_common_neighbors = true_positive_common_neighbors / (true_positive_common_neighbors + false_negative_common_neighbors) if (true_positive_common_neighbors + false_negative_common_neighbors) != 0 else 0

print(f'Common Neighbors:')
print(f'Precision: {precision_common_neighbors:.5f}')
print(f'Recall: {recall_common_neighbors:.5f}')


In [None]:
# Show the Katz predictions that are not edges of B (i.e. the false positives)
set(predicted_edges_katz).difference(B.edges())

In [None]:
# Print the product of endpoint Katz centralities for every edge of B,
# skipping products that are not strictly positive.
for u, v in B.edges():
    katz_u, katz_v = katz_centrality[u], katz_centrality[v]
    if not katz_u * katz_v > 0:
        continue
    print(katz_u*katz_v)

In [None]:
# Detect communities by greedy modularity maximisation
# (Clauset-Newman-Moore). This is not the Louvain method.
communities = nx.community.greedy_modularity_communities(B)

# (user, venue) -> number of times the pair was seen as an intra-community edge
community_common_neighbors = {}

for community in communities:
    members = set(community)
    for user in community:
        # Walk only the nodes `user` links to, rather than testing every
        # other community member with has_edge; the resulting pairs are the same.
        for venue in B[user]:
            if venue != user and venue in members:
                pair = (user, venue)
                community_common_neighbors[pair] = community_common_neighbors.get(pair, 0) + 1

# NOTE(review): the communities partition the nodes, so each pair is visited
# at most once and every stored count is 1. With a threshold of 2 nothing is
# predicted below. Confirm what this count is meant to measure.

# Set a threshold for predicting edges based on Community Common Neighbors
threshold_community_common_neighbors = 2  # Adjust as needed

# Predict the pairs whose count exceeds the threshold
predicted_edges_community_common_neighbors = [(user, venue) for (user, venue), count in community_common_neighbors.items() if count > threshold_community_common_neighbors]

# Compare the predictions with the edges of B (edge sets are built once)
ccn_existing_edges = set(B.edges())
ccn_predicted_set = set(predicted_edges_community_common_neighbors)
true_positive_community_common_neighbors = len(ccn_predicted_set & ccn_existing_edges)
false_positive_community_common_neighbors = len(ccn_predicted_set - ccn_existing_edges)
false_negative_community_common_neighbors = len(ccn_existing_edges - ccn_predicted_set)

# Precision and recall, defined as 0 when their denominator is empty
precision_community_common_neighbors = true_positive_community_common_neighbors / (true_positive_community_common_neighbors + false_positive_community_common_neighbors) if (true_positive_community_common_neighbors + false_positive_community_common_neighbors) != 0 else 0
recall_community_common_neighbors = true_positive_community_common_neighbors / (true_positive_community_common_neighbors + false_negative_community_common_neighbors) if (true_positive_community_common_neighbors + false_negative_community_common_neighbors) != 0 else 0

print(f'Community Common Neighbors:')
print(f'Precision: {precision_community_common_neighbors:.5f}')
print(f'Recall: {recall_community_common_neighbors:.5f}')


In [None]:
# Print every stored pair count that is greater than 1
for (user, venue), count in community_common_neighbors.items():
    if count <= 1:
        continue
    print(count)

In [None]:
import networkx as nx
from networkx.algorithms import community

# Partition B into communities with greedy modularity maximisation
communities = community.greedy_modularity_communities(B)

# Look-up table: node -> index of the community that contains it
community_assignment = {
    member: label
    for label, group in enumerate(communities)
    for member in group
}

# Predict an edge for every (test node, train node) pair in the same community.
# `test_nodes` and `train_nodes` must already exist from another cell.
predicted_edges_community = [
    (candidate_user, candidate_venue)
    for candidate_user in test_nodes
    for candidate_venue in train_nodes
    if community_assignment[candidate_user] == community_assignment[candidate_venue]
]

# Confusion counts against the edges of B
community_predicted_set = set(predicted_edges_community)
community_known_edges = set(B.edges())
true_positive_community = len(community_predicted_set & community_known_edges)
false_positive_community = len(community_predicted_set - community_known_edges)
false_negative_community = len(community_known_edges - community_predicted_set)

# Precision = TP / (TP + FP), 0 when nothing was predicted
if (true_positive_community + false_positive_community) != 0:
    precision_community = true_positive_community / (true_positive_community + false_positive_community)
else:
    precision_community = 0

# Recall = TP / (TP + FN), 0 when there are no actual edges
if (true_positive_community + false_negative_community) != 0:
    recall_community = true_positive_community / (true_positive_community + false_negative_community)
else:
    recall_community = 0

print(f'Community Detection Score:')
print(f'Precision: {precision_community:.5f}')
print(f'Recall: {recall_community:.5f}')


In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

# Jaccard and preferential-attachment link prediction over a node split of B.
# Assuming you have loaded the datasets
# checkins_df = pd.read_csv('NY_Restaurants_checkins.csv', header=None, names=['user_ID', 'venue_ID'])
# tips_df = pd.read_csv('NY_Restaurants_tips.csv', header=None, names=['user_ID', 'venue_ID', 'tips'])
# tags_df = pd.read_csv('NY_Restaurants_tags.csv', header=None, names=['venue_ID', 'tag_set'])

# Expects the graph `B` to have been created by an earlier cell.


def _jaccard(first, second):
    """Return |first & second| / |first | second|, or 0.0 if both sets are empty."""
    union = first | second
    return len(first & second) / len(union) if union else 0.0


# Generate train and test sets (a split of the nodes of B)
train_nodes, test_nodes = train_test_split(list(B.nodes()), test_size=0.2, random_state=42)

# Neighbour sets of the training nodes
user_venues = {user: set(B.neighbors(user)) for user in train_nodes}

# The held-out nodes need neighbour sets as well. The split is disjoint, so
# looking a test node up in `user_venues` never succeeds, and scoring only
# from that dict would make every similarity 0.0.
test_node_venues = {node: set(B.neighbors(node)) for node in test_nodes}

# Jaccard similarity of the neighbour sets for every (test node, train node) pair
user_jaccard_similarity = [
    (u, v, _jaccard(test_node_venues[u], user_venues[v]))
    for u in test_nodes for v in train_nodes
]

# nx.preferential_attachment is only implemented for undirected graphs, so
# score on an undirected view when B is directed.
pa_graph = B.to_undirected(as_view=True) if B.is_directed() else B

# Preferential-attachment score for each (train node, test node) pair.
# The pair list must be a real list: NetworkX iterates it more than once.
preferential_attachment = list(nx.preferential_attachment(pa_graph, [(u, v) for u in train_nodes for v in test_nodes]))

# Predict edges in the test set based on Jaccard similarity and Preferential Attachment
threshold_jaccard = 0
threshold_pa = 10
predicted_edges_jaccard = [(u, v) for u, v, p in user_jaccard_similarity if p > threshold_jaccard]
predicted_edges_pa = [(u, v) for u, v, pa in preferential_attachment if pa > threshold_pa]

# Both predictors are compared with the edges of B (edge set built once)
jaccard_pa_existing_edges = set(B.edges())

# Confusion counts, precision and recall for the Jaccard predictor
jaccard_predicted_set = set(predicted_edges_jaccard)
true_positive_jaccard = len(jaccard_predicted_set & jaccard_pa_existing_edges)
false_positive_jaccard = len(jaccard_predicted_set - jaccard_pa_existing_edges)
false_negative_jaccard = len(jaccard_pa_existing_edges - jaccard_predicted_set)

precision_jaccard = true_positive_jaccard / (true_positive_jaccard + false_positive_jaccard) if (true_positive_jaccard + false_positive_jaccard) != 0 else 0
recall_jaccard = true_positive_jaccard / (true_positive_jaccard + false_negative_jaccard) if (true_positive_jaccard + false_negative_jaccard) != 0 else 0

# Confusion counts, precision and recall for the preferential-attachment predictor
pa_predicted_set = set(predicted_edges_pa)
true_positive_pa = len(pa_predicted_set & jaccard_pa_existing_edges)
false_positive_pa = len(pa_predicted_set - jaccard_pa_existing_edges)
false_negative_pa = len(jaccard_pa_existing_edges - pa_predicted_set)

precision_pa = true_positive_pa / (true_positive_pa + false_positive_pa) if (true_positive_pa + false_positive_pa) != 0 else 0
recall_pa = true_positive_pa / (true_positive_pa + false_negative_pa) if (true_positive_pa + false_negative_pa) != 0 else 0

print(f'Jaccard-based Prediction:')
print(f'Precision: {precision_jaccard:}')
print(f'Recall: {recall_jaccard:}')

print(f'Preferential Attachment-based Prediction:')
print(f'Precision: {precision_pa:}')
print(f'Recall: {recall_pa:}')


In [None]:
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Combined cosine + Jaccard similarity over the tips graph.
# Expects `tips_df` with 'user_ID', 'venue_ID' and 'tips' columns.

# Create a graph
G = nx.Graph()

# One edge per (user, venue) pair: 'weight' counts the tips on that pair and
# 'tips' collects their texts.
for user, venue, tip in zip(tips_df['user_ID'], tips_df['venue_ID'], tips_df['tips']):
    if G.has_edge(user, venue):
        G[user][venue]['weight'] += 1
        # Keep the tip text too, so 'tips' stays in step with 'weight'
        G[user][venue]['tips'].append(tip)
    else:
        G.add_edge(user, venue, weight=1, tips=[tip])

# Fix the node order once so that matrix rows/columns can be mapped back to ids
nodelist = list(G.nodes())

# Weighted adjacency matrix of the graph.
# (nx.to_numpy_matrix was removed in NetworkX 3.0; to_numpy_array replaces it.)
adjacency_matrix = nx.to_numpy_array(G, nodelist=nodelist)

# Cosine similarity between the adjacency rows of every pair of nodes
cosine_similarities = cosine_similarity(adjacency_matrix, adjacency_matrix)

# from_numpy_array labels nodes 0..n-1 by matrix position. Relabel them with
# the original ids so the (u, v) lookups below address the intended nodes.
cosine_graph = nx.relabel_nodes(
    nx.from_numpy_array(cosine_similarities),
    dict(enumerate(nodelist)),
)

# Jaccard similarity of the neighbour sets of every ordered pair of nodes.
# Every node was added through an edge, so the union is never empty.
neighbour_sets = {node: set(G[node]) for node in nodelist}
user_jaccard_similarity = [
    (u, v, len(neighbour_sets[u] & neighbour_sets[v]) / len(neighbour_sets[u] | neighbour_sets[v]))
    for u in nodelist for v in nodelist
]

# Combine the two similarity scores (cosine and Jaccard) with equal weight,
# for the pairs that have a cosine edge.
combined_similarity = [
    (u, v, 0.5 * cosine_graph[u][v]['weight'] + 0.5 * jaccard)
    for u, v, jaccard in user_jaccard_similarity
    if cosine_graph.has_edge(u, v)
]

# Set a threshold for the combined similarity
threshold = 0.2
predicted_edges_combined = [(u, v) for u, v, sim in combined_similarity if sim > threshold]


In [None]:
# Evaluate the combined-similarity predictions against the held-out positives.
# `test_edges` mixes positive and negative samples (positive_test +
# negative_test), so ground truth is taken from `positive_test` alone.
actual_edges = set(positive_test)
predicted_edges = set(predicted_edges_combined)

# Calculate True Positives, False Positives, and False Negatives
true_positives = len(actual_edges & predicted_edges)
false_positives = len(predicted_edges - actual_edges)
false_negatives = len(actual_edges - predicted_edges)

# Calculate Precision and Recall (0 when the denominator is empty)
precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0

print("Precision:", precision)
print("Recall:", recall)


In [None]:
# Jaccard similarity of the `user_venues` sets for each (test node, train node)
# pair. The score falls back to 0.0 when either node is missing from
# `user_venues` or has an empty set.
user_jaccard_similarity = [
    (
        u,
        v,
        len(user_venues[u] & user_venues[v]) / len(user_venues[u] | user_venues[v])
        if u in user_venues and v in user_venues and user_venues[u] and user_venues[v]
        else 0.0,
    )
    for u in test_nodes
    for v in train_nodes
]


In [None]:
# Print every strictly positive Jaccard similarity
for i, j, k in user_jaccard_similarity:
    if not k > 0:
        continue
    print(k)

In [None]:
# Map each training node to the set of its neighbours in B
user_venues = {node: set(B.neighbors(node)) for node in train_nodes}

# Dump the neighbour sets for inspection
print("User Venues:")
for user, venues in user_venues.items():
    print(f"{user}: {venues}")

# Jaccard similarity for each (test node, train node) pair; 0.0 when either
# node has no entry in `user_venues`.
user_jaccard_similarity = [
    (
        u,
        v,
        len(user_venues[u] & user_venues[v]) / len(user_venues[u] | user_venues[v])
        if u in user_venues and v in user_venues
        else 0.0,
    )
    for u in test_nodes
    for v in train_nodes
]

# Dump the strictly positive similarities for inspection
print("\nJaccard Similarities:")
for u, v, p in user_jaccard_similarity:
    if not p > 0:
        continue
    print(f"Jaccard({u}, {v}) = {p}")

# ... the remaining steps are unchanged


In [None]:
from itertools import islice, product

# Print the negative-shortest-distance score for the first 50
# (test node, train node) pairs, in test-major order.
# islice stops the whole iteration at the limit; a `break` in the inner loop
# would leave the outer loop running over every remaining test node.
preview_limit = 50
count = 0
for user, venue in islice(product(test_nodes, train_nodes), preview_limit):
    count += 1
    print(negative_shortest_distance_score(B, user, venue))

In [None]:
# Print the combined score of every (test node, train node) pair that exceeds 1
count = 0
for user in test_nodes:
    for venue in train_nodes:
        # Score each pair once and reuse the value for both the test and the print
        combined_score = calculate_combined_score(user, venue)
        if combined_score > 1:
            print(combined_score)
#         common_neighbors_venue_score(B, user, venue)