# Bibliometric Citation Analysis Project

With this project, I was able to search and filter research papers suitable for a particular research objective. In the first half, the research objective from the Coursera project guidelines was provided and Gephi was used to build and visualize the entire network for the Coursera piece of the project. The available Gephi exports are in the bibliometric data folder and the final network image shown below.

![title](bibliometric_data/CitationAnalysis.png)

For this next part, NetworkX will be used.


## Import Dependencies and Bibliometric Data

In [105]:
import networkx as nx
import pandas as pd
import plotly.graph_objects as go

In [58]:
# Read all sheets in the excel file
df = pd.read_excel(open('./bibliometric_data/citation_data.xlsx','rb'))
df.head()

Unnamed: 0,Linkage,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,...,DOI,Link,ISSN,ISBN,CODEN,Document Type,Publication Stage,Access Type,Source,EID
0,p1,"Fehr E., Schmidt K.M.",56211794400;24308313900;,"A theory of fairness, competition, and coopera...",1999,Quarterly Journal of Economics,114.0,3.0,,817.0,...,10.1162/003355399556151,https://www.scopus.com/inward/record.uri?eid=2...,335533.0,,QJECA,Article,Final,,Scopus,2-s2.0-0000773694
1,r1,Homans G.C.,,[No title available],1961,Social Behavior: Its Elementary Forms,,,,,...,,,,,,,,,Scopus,2-s2.0-0004161557
2,r1,"Tversky A., Kahneman D.",,Loss aversion in riskless choice: A reference-...,1991,Quarterly Journal of Economics,106.0,4.0,,1039.0,...,,,,,,,,,Scopus,2-s2.0-0001076162
3,r1,Rabin M.,,Incorporating fairness into game theory and ec...,1993,American Economic Review,83.0,5.0,,1281.0,...,,,,,,,,,Scopus,2-s2.0-0000832255
4,r1,"Güth W., Schmittberger R., Schwarze B.",7004431518;24523628900;36931141400;,An experimental analysis of ultimatum bargaining,1982,Journal of Economic Behavior and Organization,3.0,4.0,,367.0,...,10.1016/0167-2681(82)90011-7,https://www.scopus.com/inward/record.uri?eid=2...,1672681.0,,JEBOD,Article,Final,,Scopus,2-s2.0-41449091490


## Select Features and Transform Data

The columns of interest are Linkage, Authors, Year, EID, and then we will create a new composite column called `Label` containing the combination of authors_year separated by a comma and space.

In [59]:
df['Label'] = df.apply(lambda row: row.Authors + ", " + str(row.Year), axis=1)

features = ["Linkage", "Authors", "Year", "EID", "Label"]

df[features]

Unnamed: 0,Linkage,Authors,Year,EID,Label
0,p1,"Fehr E., Schmidt K.M.",1999,2-s2.0-0000773694,"Fehr E., Schmidt K.M., 1999"
1,r1,Homans G.C.,1961,2-s2.0-0004161557,"Homans G.C., 1961"
2,r1,"Tversky A., Kahneman D.",1991,2-s2.0-0001076162,"Tversky A., Kahneman D., 1991"
3,r1,Rabin M.,1993,2-s2.0-0000832255,"Rabin M., 1993"
4,r1,"Güth W., Schmittberger R., Schwarze B.",1982,2-s2.0-41449091490,"Güth W., Schmittberger R., Schwarze B., 1982"
...,...,...,...,...,...
135,r4,Savin N.E.,1984,2-s2.0-0009981024,"Savin N.E., 1984"
136,r4,"Kim P.J., Jennrich R.I.",1973,2-s2.0-0003329984,"Kim P.J., Jennrich R.I., 1973"
137,r4,Thaler R.H.,1987,2-s2.0-0141728960,"Thaler R.H., 1987"
138,r4,"Binmore K., Shaked A., Sutton J.",1984,2-s2.0-84909727619,"Binmore K., Shaked A., Sutton J., 1984"


## Build the Primary Networks

P1 is associated to R1, P2 goes with R2, etc all the way to P4/R4. As shown in the Excel sheet, P2 is also marked with an R1, further complicating the extraction.

In [88]:
def assign_group(linkage):
    groups = []
    if 'p1' in linkage or "r1" in linkage:
        groups.append('paper 1')
    if 'p2' in linkage or "r2" in linkage:
        groups.append('paper 2')
    if 'p3' in linkage or "r3" in linkage:
        groups.append('paper 3')
    else:
        groups.append('paper 4')
    
    return groups

features.append('Groups')

In [89]:
# Explode the DataFrame based on the new groups
df['Groups'] = df['Linkage'].apply(assign_group)
df_exploded = df.explode('Groups').reset_index(drop=True)

In [91]:
first_group = df_exploded[df_exploded["Groups"] == "paper 1"][features]
second_group = df_exploded[df_exploded["Groups"] == "paper 2"][features]
third_group = df_exploded[df_exploded["Groups"] == "paper 3"][features]
fourth_group = df_exploded[df_exploded["Groups"] == "paper 4"][features]

first_group

Unnamed: 0,Linkage,Authors,Year,EID,Label,Groups
0,p1,"Fehr E., Schmidt K.M.",1999,2-s2.0-0000773694,"Fehr E., Schmidt K.M., 1999",paper 1
2,r1,Homans G.C.,1961,2-s2.0-0004161557,"Homans G.C., 1961",paper 1
4,r1,"Tversky A., Kahneman D.",1991,2-s2.0-0001076162,"Tversky A., Kahneman D., 1991",paper 1
6,r1,Rabin M.,1993,2-s2.0-0000832255,"Rabin M., 1993",paper 1
8,r1,"Güth W., Schmittberger R., Schwarze B.",1982,2-s2.0-41449091490,"Güth W., Schmittberger R., Schwarze B., 1982",paper 1
...,...,...,...,...,...,...
152,r1,Runciman W.G.,1966,2-s2.0-0040002841,"Runciman W.G., 1966",paper 1
154,r1,"Falkinger J., Fehr E., Gächter S., Winter-Ebme...",0,2-s2.0-85034132272,"Falkinger J., Fehr E., Gächter S., Winter-Ebme...",paper 1
156,r1,Charness G.,0,2-s2.0-85034129033,"Charness G., 0",paper 1
158,r1,Festinger L.,1954,2-s2.0-4644234910,"Festinger L., 1954",paper 1


In [92]:
second_group.head(5)

Unnamed: 0,Linkage,Authors,Year,EID,Label,Groups
161,r1/p2,"Berg J., Dickhaut J., McCabe K.",1995,2-s2.0-58149326397,"Berg J., Dickhaut J., McCabe K., 1995",paper 2
163,r2,Schelling T.C.,1960,2-s2.0-0004165120,"Schelling T.C., 1960",paper 2
165,r2,Rabin M.,1993,2-s2.0-0000832255,"Rabin M., 1993",paper 2
167,r2,Arrow K.J.,1974,2-s2.0-0004197365,"Arrow K.J., 1974",paper 2
169,r2,"Cosmides L., Tooby J.",1992,2-s2.0-0002230120,"Cosmides L., Tooby J., 1992",paper 2


In [161]:
def build_network(paper_group, primary_paper_label: str):
    primary_paper = paper_group[paper_group["Linkage"] == primary_paper_label]
    other_group_papers = paper_group[paper_group['Linkage'] != primary_paper_label]

    paper_network = nx.Graph()

    paper_network.add_node(
        primary_paper["Label"].item(), 
        group=primary_paper["Groups"].item()
        )

    for index, row in other_group_papers.iterrows():
        paper_network.add_node(
            row["Label"], 
            group=row["Groups"]
        )

        paper_network.add_edge(
            primary_paper["Label"].item(),
            row["Label"]
        )

    return paper_network

In [166]:
# Grouped Papers
paper_1_network = build_network(first_group, "p1")
paper_2_network = build_network(second_group, "r1/p2")
paper_3_network = build_network(third_group, "p3")
paper_4_network = build_network(fourth_group, "p4")

In [167]:
colors = {
    'paper 1': 'red', 
    'paper 2': 'blue', 
    'paper 3': 'green', 
    'paper 4': 'grey'
    }
shapes = {
    'paper 1': 'circle', 
    'paper 2': 'square', 
    'paper 3': 'triangle-up', 
    'paper 4': 'cross'
    }
group_positions = {
    'paper 1': (0, 0),
    'paper 2': (2, 2),
    'paper 3': (-2, -2),
    'paper 4': (4, 4)
}
unique_groups = [
        {"graph": paper_1_network, "group": "paper 1"},
        {"graph": paper_2_network, "group": "paper 2"},
        {"graph": paper_3_network, "group": "paper 3"},
        {"graph": paper_4_network, "group": "paper 4"}
    ]

In [198]:
def connect_graphs_by_labels(graphs, df):
    # Create a global graph to combine all graphs and connect nodes by labels
    G_global = nx.Graph()
    
    # Iterate through each graph and add nodes to the global graph
    for graph in graphs:
        for node, data in graph.nodes(data=True):
            if node in G_global:
                # If the node already exists in the global graph, connect it to the existing node
                G_global.add_edges_from([(node, existing_node) for existing_node in G_global if node == existing_node])
            else:
                G_global.add_node(node, **data)

    return G_global

In [202]:
def draw_citation_graphs(citation_graphs):
    fig = go.Figure()
    G_global = nx.Graph()

    # Create separate graphs for each paper group
    for graph_entry in citation_graphs:
        graph = graph_entry["graph"]
        group = graph_entry["group"]

        # Create positions for nodes
        pos = nx.spring_layout(graph)

        # Adjust positions to separate clusters
        for node in pos:
            pos[node][0] += group_positions[group][0]
            pos[node][1] += group_positions[group][1]

        # Add edges to the plot
        edge_x = []
        edge_y = []

        for edge in graph.edges():
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            edge_x.append(x0)
            edge_x.append(x1)
            edge_x.append(None)
            edge_y.append(y0)
            edge_y.append(y1)
            edge_y.append(None)

        edge_trace = go.Scatter(
                x=edge_x, y=edge_y,
                line=dict(width=0.5, color='#888'),
                hoverinfo='none',
                mode='lines')

        fig.add_trace(edge_trace)

        # Add nodes to the plot
        node_x = []
        node_y = []
        node_text = []

        for node in graph.nodes():
                x, y = pos[node]
                node_x.append(x)
                node_y.append(y)
                node_text.append(node)

        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode='markers',
                text=node_text,
            textposition='top center',
            hoverinfo='text',
            marker=dict(
                showscale=False,
                color=colors[group],
                size=4,
                symbol=shapes[group],
                line_width=2))

        fig.add_trace(node_trace)

    # Connect nodes across different groups
    cross_group_edges = []
    for name, group_data in df_exploded.groupby('Label'):
        nodes = [f"{name}" for _, row in group_data.iterrows()]
        for i in range(len(nodes)):
            for j in range(i + 1, len(nodes)):
                if nodes[i] in G_global and nodes[j] in G_global:
                    G_global.add_edge(nodes[i], nodes[j])
                    cross_group_edges.append((nodes[i], nodes[j]))

    # Create positions for global graph
    pos_global = nx.spring_layout(G_global)

    # Adjust positions to separate clusters
    for group, (dx, dy) in group_positions.items():
        for node in G_global.nodes(data=True):
            if node[1]['group'] == group:
                pos_global[node[0]][0] += dx
                pos_global[node[0]][1] += dy

    # Add cross-group edges to the plot
    cross_edge_x = []
    cross_edge_y = []

    for edge in cross_group_edges:
        x0, y0 = pos_global[edge[0]]
        x1, y1 = pos_global[edge[1]]
        cross_edge_x.append(x0)
        cross_edge_x.append(x1)
        cross_edge_x.append(None)
        cross_edge_y.append(y0)
        cross_edge_y.append(y1)
        cross_edge_y.append(None)

    cross_edge_trace = go.Scatter(
        x=cross_edge_x, y=cross_edge_y,
        line=dict(width=1, color='black', dash='dot'),
        hoverinfo='none',
        mode='lines')

    fig.add_trace(cross_edge_trace)

    return fig

In [203]:
fig = draw_citation_graphs(
    unique_groups
)

fig.update_layout(
    title='Bibliographic Citation Graphs',
    titlefont_size=16,
    showlegend=False,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    annotations=[dict(
        text="",
        showarrow=False,
        xref="paper", yref="paper"
    )],
    xaxis=dict(showgrid=False, zeroline=False),
    yaxis=dict(showgrid=False, zeroline=False)
)

fig.show()

In [None]:
# FUTURE TODO: Link cross-network citations