#1.Summary

In [12]:
import pandas as pd
import plotly.express as px
from collections import Counter

# Read the CSV file (new name)
df = pd.read_csv("New_And_Original_ResearchOutputs.csv")

# Split the data into three sources based on row indices (excluding header):
# WebScrap: rows 1-652 (i.e., df.iloc[0:652]) -> 652 records
# ResearchOutputs: rows 653-2387 (i.e., df.iloc[652:2387]) -> 1735 records
# API: rows 2388-3369 (i.e., df.iloc[2387:3369]) -> 982 records
df_web = df.iloc[0:652]
df_research = df.iloc[652:2387]
df_api = df.iloc[2387:3369]

# Create a DataFrame with source names and record counts
counts = {
    "Source": ["WebScrap", "ResearchOutputs", "API"],
    "Record_Count": [len(df_web), len(df_research), len(df_api)]
}
counts_df = pd.DataFrame(counts)

# Chart 1: Bar chart for record count per data source
fig1 = px.bar(counts_df, x="Source", y="Record_Count", text="Record_Count",
              title="Record Count per Data Source")
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig1.show()

# Chart 2: Overall Year Distribution Histogram (if 'year' column exists)
if "year" in df.columns:
    fig2 = px.histogram(df, x="year",
                        title="Overall Year Distribution",
                        labels={"year": "Year"})
    fig2.show()

# Create a new column for the number of authors per article
df["num_authors"] = df["authors"].fillna("").apply(lambda x: len([a for a in x.split(";") if a.strip() != ""]))

# Chart 3: Histogram of Number of Authors per Article (Overall)
fig3 = px.histogram(df, x="num_authors",
                    title="Overall Distribution of Authors per Article",
                    labels={"num_authors": "Number of Authors"})
fig3.show()

# Chart 4: Top 10 Authors by Article Count (Overall)
all_authors = []
df["authors"].dropna().apply(lambda x: all_authors.extend([a.strip() for a in x.split(";") if a.strip() != ""]))
author_counts = Counter(all_authors)
top_authors = author_counts.most_common(10)
top_authors_df = pd.DataFrame(top_authors, columns=["Author", "Article_Count"])
fig4 = px.bar(top_authors_df, x="Author", y="Article_Count", text="Article_Count",
              title="Top 10 Authors by Article Count (Overall)")
fig4.update_traces(texttemplate='%{text}', textposition='outside')
fig4.show()

# Print overall statistics
print("Overall Data Statistics:")
print(counts_df)

Overall Data Statistics:
            Source  Record_Count
0         WebScrap           652
1  ResearchOutputs          1735
2              API           982


In [13]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from scipy import stats
import os

# Create output directory with explicit error handling
try:
    os.makedirs('output/visualizations', exist_ok=True)
    print("Directory created successfully or already exists")
except Exception as e:
    print(f"Error creating directory: {e}")
    # Fallback to current directory
    output_dir = './'
else:
    output_dir = 'output/visualizations/'

# Function to save and display figure
def save_and_display(fig, filename):
    # Always display the figure
    fig.show()
    
    # Try to save it as well
    try:
        full_path = f'{output_dir}{filename}'
        fig.write_html(full_path)
        print(f"Successfully saved to {full_path}")
    except Exception as e:
        print(f"Error saving figure: {e}")

# 5.2.1 Temporal Analysis
# Count publications by year
year_counts = df['year'].value_counts().sort_index()

# Create bar chart for publication by year
fig_year = px.bar(
    x=year_counts.index, 
    y=year_counts.values,
    title='Distribution of Publications by Year',
    labels={'x': 'Year', 'y': 'Number of Publications'}
)
fig_year.update_layout(template='plotly_white')

# Save and display
save_and_display(fig_year, 'publication_by_year.html')

# Calculate annual growth rate
annual_growth = year_counts.pct_change() * 100
avg_growth_rate = annual_growth.mean()
print(f"Average annual growth rate of publications: {avg_growth_rate:.2f}%")

# Linear regression for time trend
years = np.array(year_counts.index).reshape(-1, 1)
counts = year_counts.values
model = LinearRegression()
model.fit(years, counts)
prediction = model.predict(years)

# Create scatter plot with trend line
fig_trend = px.scatter(
    x=year_counts.index, 
    y=year_counts.values,
    title=f'Publication Trend (Slope: {model.coef_[0]:.2f})',
    labels={'x': 'Year', 'y': 'Number of Publications'}
)
fig_trend.add_trace(
    go.Scatter(
        x=year_counts.index, 
        y=prediction.flatten(), 
        mode='lines', 
        name='Trend Line',
        line=dict(color='red')
    )
)
fig_trend.update_layout(template='plotly_white')

# Save and display
save_and_display(fig_trend, 'publication_trend.html')

# 5.2.2 Institution Analysis
try:
    # Count publications for top 20 institutions
    institution_counts = df.explode('institution_display_names')['institution_display_names'].value_counts().head(20)

    # Create bar chart for top institutions
    fig_inst = px.bar(
        y=institution_counts.index, 
        x=institution_counts.values,
        title='Top 20 Institutions by Publication Count',
        labels={'x': 'Number of Publications', 'y': 'Institution'},
        orientation='h'
    )
    fig_inst.update_layout(template='plotly_white', height=800)

    # Save and display
    save_and_display(fig_inst, 'top_institutions.html')

except Exception as e:
    print(f"Error analyzing institutions: {e}")

# Institution collaboration network analysis
try:
    institution_pairs = []
    for _, row in df.iterrows():
        institutions = row['institution_display_names']
        if isinstance(institutions, list) and len(institutions) > 1:
            for i in range(len(institutions)):
                for j in range(i+1, len(institutions)):
                    institution_pairs.append((institutions[i], institutions[j]))

    # Calculate institution collaboration frequency
    if institution_pairs:
        collaboration_counts = pd.Series(institution_pairs).value_counts().head(15)
        collaboration_df = pd.DataFrame({'Count': collaboration_counts.values})
        collaboration_df['Institution Pair'] = [f"{pair[0]} & {pair[1]}" for pair in collaboration_counts.index]

        # Create bar chart for institution collaborations
        fig_collab = px.bar(
            collaboration_df,
            y='Institution Pair', 
            x='Count',
            title='Top 15 Institution Collaborations',
            labels={'Count': 'Number of Collaborations', 'Institution Pair': 'Institution Pair'},
            orientation='h'
        )
        fig_collab.update_layout(template='plotly_white', height=600)

        # Save and display
        save_and_display(fig_collab, 'institution_collaborations.html')
    else:
        print("No institution pairs found for collaboration analysis")
except Exception as e:
    print(f"Error analyzing institution collaborations: {e}")

# 5.2.3 Keyword Analysis
try:
    # Keyword frequency analysis
    all_keywords = df.explode('keywords')['keywords'].value_counts().head(30)

    # Create bar chart for top keywords
    fig_keyword = px.bar(
        y=all_keywords.index, 
        x=all_keywords.values,
        title='Top 30 Keywords',
        labels={'x': 'Frequency', 'y': 'Keyword'},
        orientation='h'
    )
    fig_keyword.update_layout(template='plotly_white', height=800)

    # Save and display
    save_and_display(fig_keyword, 'top_keywords.html')

    # Keyword co-occurrence network
    keyword_pairs = []
    for _, row in df.iterrows():
        keywords = row['keywords']
        if isinstance(keywords, list) and len(keywords) > 1:
            for i in range(len(keywords)):
                for j in range(i+1, len(keywords)):
                    keyword_pairs.append((keywords[i], keywords[j]))

    # Calculate keyword co-occurrence frequency
    if keyword_pairs:
        cooccurrence_counts = pd.Series(keyword_pairs).value_counts().head(20)
        cooccurrence_df = pd.DataFrame({'Count': cooccurrence_counts.values})
        cooccurrence_df['Keyword Pair'] = [f"{pair[0]} & {pair[1]}" for pair in cooccurrence_counts.index]

        # Create bar chart for keyword co-occurrences
        fig_cooccur = px.bar(
            cooccurrence_df,
            y='Keyword Pair', 
            x='Count',
            title='Top 20 Keyword Co-occurrences',
            labels={'Count': 'Co-occurrence Frequency', 'Keyword Pair': 'Keyword Pair'},
            orientation='h'
        )
        fig_cooccur.update_layout(template='plotly_white', height=700)

        # Save and display
        save_and_display(fig_cooccur, 'keyword_cooccurrences.html')
    else:
        print("No keyword pairs found for co-occurrence analysis")

    # Annual keyword trends - corrected version
    # Create year-keyword matrix with explicit column names
    year_keyword_data = []
    for _, row in df.iterrows():
        year = row['year']
        keywords = row['keywords']
        if pd.notna(year) and isinstance(keywords, list):
            for kw in keywords:
                year_keyword_data.append({'year': year, 'keyword_name': kw})  # Use explicit column name

    # Create DataFrame and check its structure
    year_keyword_df = pd.DataFrame(year_keyword_data)
    print(f"Year-keyword DataFrame columns: {year_keyword_df.columns.tolist()}")
    
    if len(year_keyword_df) > 0:
        top_keywords = all_keywords.head(10).index
        
        # Use the correct column name
        yearly_keyword_counts = year_keyword_df[year_keyword_df['keyword_name'].isin(top_keywords)].groupby(['year', 'keyword_name']).size().reset_index(name='count')
        
        # Check result
        print(f"Created yearly keyword counts with shape: {yearly_keyword_counts.shape}")
        
        if len(yearly_keyword_counts) > 0:
            # Create heatmap for keyword trends by year
            fig_heatmap = px.density_heatmap(
                yearly_keyword_counts,
                x='keyword_name',  # Use the correct column name
                y='year',
                z='count',
                title='Keyword Trends by Year',
                labels={'keyword_name': 'Keyword', 'year': 'Year', 'count': 'Frequency'},  # Update labels
                color_continuous_scale='YlGnBu'
            )
            fig_heatmap.update_layout(template='plotly_white')

            # Save and display
            save_and_display(fig_heatmap, 'keyword_trends.html')
        else:
            print("Warning: No data after filtering for top keywords")
    else:
        print("Warning: Year-keyword DataFrame is empty!")
except Exception as e:
    print(f"Error in keyword analysis: {e}")

# 5.2.4 Statistical Analysis
try:
    # Relationship between citation count and publication year
    citation_year_df = df[['year', 'citation_count']].dropna()
    
    if len(citation_year_df) > 0:
        # Calculate correlation
        correlation, p_value = stats.pearsonr(citation_year_df['year'], citation_year_df['citation_count'])
        print(f"Correlation between publication year and citation count: r = {correlation:.2f}, p = {p_value:.4f}")

        # Fit regression model
        X = citation_year_df['year'].values.reshape(-1, 1)
        y = citation_year_df['citation_count'].values
        model = LinearRegression()
        model.fit(X, y)

        # Create scatter plot for citation vs year
        fig_cite_year = px.scatter(
            citation_year_df,
            x='year', 
            y='citation_count',
            opacity=0.6,
            title=f'Citation Count vs. Publication Year (r = {correlation:.2f})',
            labels={'year': 'Publication Year', 'citation_count': 'Citation Count'}
        )

        # Add trend line
        years_sorted = np.sort(citation_year_df['year'].values)
        fig_cite_year.add_trace(
            go.Scatter(
                x=years_sorted, 
                y=model.predict(years_sorted.reshape(-1, 1)), 
                mode='lines', 
                name='Trend Line',
                line=dict(color='red')
            )
        )
        fig_cite_year.update_layout(template='plotly_white')

        # Save and display
        save_and_display(fig_cite_year, 'citation_by_year.html')
    else:
        print("No citation data available for correlation analysis")

    # Relationship between author count and citation count
    try:
        author_citation_df = df[['authors', 'citation_count']].dropna()
        author_citation_df['author_count'] = author_citation_df['authors'].apply(lambda x: len(x) if isinstance(x, list) else 0)
        author_citation_df = author_citation_df[author_citation_df['author_count'] > 0]

        if len(author_citation_df) > 0:
            # Group by author count and calculate average citations
            avg_citations = author_citation_df.groupby('author_count')['citation_count'].mean().reset_index()

            # Create bar chart for citation by author count
            fig_cite_author = px.bar(
                avg_citations,
                x='author_count', 
                y='citation_count',
                title='Average Citation Count by Number of Authors',
                labels={'author_count': 'Number of Authors', 'citation_count': 'Average Citation Count'}
            )
            fig_cite_author.update_layout(template='plotly_white')

            # Save and display
            save_and_display(fig_cite_author, 'citation_by_author_count.html')
        else:
            print("No author citation data available for analysis")
    except Exception as e:
        print(f"Error in author citation analysis: {e}")

    # Relationship between institution count and citation count
    try:
        institution_citation_df = df[['institution_display_names', 'citation_count']].dropna()
        institution_citation_df['institution_count'] = institution_citation_df['institution_display_names'].apply(
            lambda x: len(x) if isinstance(x, list) else 0)
        institution_citation_df = institution_citation_df[institution_citation_df['institution_count'] > 0]

        if len(institution_citation_df) > 0:
            # Group by institution count and calculate average citations
            avg_inst_citations = institution_citation_df.groupby('institution_count')['citation_count'].mean().reset_index()

            # Create bar chart for citation by institution count
            fig_cite_inst = px.bar(
                avg_inst_citations,
                x='institution_count', 
                y='citation_count',
                title='Average Citation Count by Number of Institutions',
                labels={'institution_count': 'Number of Institutions', 'citation_count': 'Average Citation Count'}
            )
            fig_cite_inst.update_layout(template='plotly_white')

            # Save and display
            save_and_display(fig_cite_inst, 'citation_by_institution_count.html')
        else:
            print("No institution citation data available for analysis")
    except Exception as e:
        print(f"Error in institution citation analysis: {e}")
except Exception as e:
    print(f"Error in statistical analysis: {e}")

# 5.2.5 Principal Component Analysis
try:
    # Create institution co-occurrence matrix
    institution_data = df.explode('institution_display_names')['institution_display_names'].value_counts().head(50).index
    institution_paper_matrix = pd.DataFrame(0, index=df.index, columns=institution_data)

    for i, row in df.iterrows():
        institutions = row['institution_display_names']
        if isinstance(institutions, list):
            for inst in institutions:
                if inst in institution_data:
                    institution_paper_matrix.loc[i, inst] = 1

    # Check if we have enough data
    if institution_paper_matrix.sum().sum() > 0:
        # Apply PCA
        pca = PCA(n_components=min(10, len(institution_data)))
        pca_result = pca.fit_transform(institution_paper_matrix)

        # Calculate explained variance
        explained_variance = pca.explained_variance_ratio_
        cumulative_variance = np.cumsum(explained_variance)

        # Create dataframe for plotting
        variance_df = pd.DataFrame({
            'Component': [f'PC{i+1}' for i in range(len(explained_variance))],
            'Individual': explained_variance,
            'Cumulative': cumulative_variance
        })

        # Create bar and line chart for explained variance
        fig_variance = make_subplots(specs=[[{"secondary_y": True}]])

        # Add bars for individual explained variance
        fig_variance.add_trace(
            go.Bar(
                x=variance_df['Component'],
                y=variance_df['Individual'],
                name='Individual Explained Variance'
            ),
            secondary_y=False
        )

        # Add line for cumulative explained variance
        fig_variance.add_trace(
            go.Scatter(
                x=variance_df['Component'],
                y=variance_df['Cumulative'],
                name='Cumulative Explained Variance',
                line=dict(color='red')
            ),
            secondary_y=True
        )

        # Add horizontal line at 95% threshold
        fig_variance.add_trace(
            go.Scatter(
                x=variance_df['Component'],
                y=[0.95] * len(variance_df),
                mode='lines',
                line=dict(dash='dash', color='green'),
                name='95% Threshold'
            ),
            secondary_y=True
        )

        fig_variance.update_layout(
            title='PCA Explained Variance',
            template='plotly_white',
            xaxis_title='Principal Component',
            legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
        )
        fig_variance.update_yaxes(title_text='Individual Explained Variance', secondary_y=False)
        fig_variance.update_yaxes(title_text='Cumulative Explained Variance', secondary_y=True)

        # Save and display
        save_and_display(fig_variance, 'pca_variance.html')

        # Create dataframe for top components visualization
        pca_df = pd.DataFrame(
            pca.components_.T, 
            columns=[f'PC{i+1}' for i in range(pca.n_components_)],
            index=institution_data
        )

        # Scale components for visualization
        pc1_scaled = pca_df['PC1'] * 10
        pc2_scaled = pca_df['PC2'] * 10
        institution_importance = np.sqrt(pc1_scaled**2 + pc2_scaled**2)
        top_institutions = institution_importance.sort_values(ascending=False).head(15).index

        # Create scatter plot for PCA components
        fig_pca = px.scatter(
            x=pca_result[:, 0],
            y=pca_result[:, 1],
            opacity=0.6,
            title='PCA: First Two Principal Components',
            labels={
                'x': f'PC1 ({explained_variance[0]:.2%} variance)',
                'y': f'PC2 ({explained_variance[1]:.2%} variance)'
            }
        )

        # Add annotations for top institutions
        for inst in top_institutions:
            pc1, pc2 = pca_df.loc[inst, 'PC1'], pca_df.loc[inst, 'PC2']
            fig_pca.add_annotation(
                x=pc1*10, 
                y=pc2*10,
                text=inst,
                showarrow=True,
                arrowhead=2,
                arrowsize=1,
                arrowwidth=1,
                ax=20,
                ay=-30
            )

        fig_pca.update_layout(
            template='plotly_white',
            height=700,
            width=900
        )

        # Save and display
        save_and_display(fig_pca, 'pca_components.html')
    else:
        print("Not enough institution data for PCA analysis")
except Exception as e:
    print(f"Error in PCA analysis: {e}")

print("All interactive visualizations have been processed and displayed.")

Directory created successfully or already exists


Successfully saved to output/visualizations/publication_by_year.html
Average annual growth rate of publications: 16.67%


Successfully saved to output/visualizations/publication_trend.html


Successfully saved to output/visualizations/top_institutions.html
No institution pairs found for collaboration analysis


Successfully saved to output/visualizations/top_keywords.html
No keyword pairs found for co-occurrence analysis
Year-keyword DataFrame columns: []
Error in statistical analysis: "['citation_count'] not in index"
Not enough institution data for PCA analysis
All interactive visualizations have been processed and displayed.


#2.WebScraping Data

In [14]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Read the CSV file and extract the WebScrap portion of the data
df = pd.read_csv("New_And_Original_ResearchOutputs.csv")
df_web = df.iloc[0:652]

# Print basic descriptive statistics for the WebScrap data
print("WebScrap Data Source EDA")
print("Record Count:", len(df_web))
print(df_web.describe(include='all'))

# --- New Analysis for Target Columns ---
# Define target columns for checking True values
target_cols = ["acknowledgments", "data_descriptions", "disclosure_review", "rdc_mentions", "dataset_mentions"]

# Convert target columns to boolean if not already
for col in target_cols:
    df_web[col] = df_web[col].astype(bool)

# Count True values for each of the target columns
true_counts = df_web[target_cols].sum()

# Create a new column 'target_data' which is True if at least one of the target columns is True
df_web["target_data"] = df_web[target_cols].any(axis=1)

# Count the number of records that satisfy the target condition
target_records_count = df_web["target_data"].sum()

# Chart A: Bar chart for True counts of the target columns
true_counts_df = true_counts.reset_index()
true_counts_df.columns = ["Column", "True_Count"]
fig_target = px.bar(true_counts_df, x="Column", y="True_Count", text="True_Count",
                    title="WebScrap: True Counts for Target Columns")
fig_target.update_traces(texttemplate='%{text}', textposition='outside')
fig_target.show()

# Print the number of target records based on target columns condition
print("Number of target records (at least one True among target columns):", target_records_count)

# --- End of New Analysis ---

# Word Cloud for 'Keywords'
keywords_text = " ".join(df_web["Keywords"].dropna().astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(keywords_text)
wordcloud_image = wordcloud.to_array()
fig_wc = px.imshow(wordcloud_image, title="WebScrap Keywords Word Cloud")
fig_wc.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)
fig_wc.show()

# Create additional columns: number of authors, title length, and abstract length
df_web["num_authors"] = df_web["authors"].fillna("").apply(lambda x: len([a for a in x.split(";") if a.strip() != ""]))
df_web["title_length"] = df_web["title"].fillna("").apply(lambda x: len(x.split()))
df_web["abstract_length"] = df_web["abstract"].fillna("").apply(lambda x: len(x.split()))

# Chart 1: Histogram of Number of Authors per Article for WebScrap
fig1 = px.histogram(df_web, x="num_authors",
                    title="WebScrap: Distribution of Authors per Article",
                    labels={"num_authors": "Number of Authors"})
fig1.show()

# Chart 2: Top 10 Authors by Article Count for WebScrap
all_authors = []
df_web["authors"].dropna().apply(lambda x: all_authors.extend([a.strip() for a in x.split(";") if a.strip() != ""]))
author_counts = Counter(all_authors)
top_authors = author_counts.most_common(10)
top_authors_df = pd.DataFrame(top_authors, columns=["Author", "Article_Count"])
fig2 = px.bar(top_authors_df, x="Author", y="Article_Count", text="Article_Count",
              title="WebScrap: Top 10 Authors by Article Count")
fig2.update_traces(texttemplate='%{text}', textposition='outside')
fig2.show()

# Chart 3: Scatter Plot of Title Length vs. Abstract Length for WebScrap
fig3 = px.scatter(df_web, x="title_length", y="abstract_length",
                  title="WebScrap: Title Length vs. Abstract Length",
                  labels={"title_length": "Title Length (words)", "abstract_length": "Abstract Length (words)"})
fig3.show()

# Chart 4: Pie Chart for Source Distribution in WebScrap Data
fig_source = px.pie(df_web, names="source", title="WebScrap: Source Distribution")
fig_source.show()

WebScrap Data Source EDA
Record Count: 652
                                                    title  \
count                                                 652   
unique                                                652   
top     The Impact of Geopolitical Conflicts on Trade,...   
freq                                                    1   
mean                                                  NaN   
std                                                   NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   

                                                 abstract  \
count                                                 652   
unique                                                652   
top     Geopolitical conflicts have incre



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Number of target records (at least one True among target columns): 652




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#3.ResearchOutputs(Original) Data

In [15]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Read the CSV file and extract the ResearchOutputs portion of the data
df = pd.read_csv("New_And_Original_ResearchOutputs.csv")
df_research = df.iloc[652:2387]  # Expected record count: 1735

# Print basic descriptive statistics for the ResearchOutputs data
print("ResearchOutputs Data Source EDA")
print("Record Count:", len(df_research))
print(df_research.describe(include='all'))

# Word Cloud for 'Keywords'
keywords_text = " ".join(df_research["Keywords"].dropna().astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(keywords_text)
wordcloud_image = wordcloud.to_array()
fig_wc = px.imshow(wordcloud_image, title="ResearchOutputs Keywords Word Cloud")
fig_wc.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)
fig_wc.show()

# Create additional columns: number of authors
df_research["num_authors"] = df_research["authors"].fillna("").apply(lambda x: len([a for a in x.split(";") if a.strip() != ""]))

# Chart 1: Histogram of Number of Authors per Article for ResearchOutputs
fig1 = px.histogram(df_research, x="num_authors",
                    title="ResearchOutputs: Distribution of Authors per Article",
                    labels={"num_authors": "Number of Authors"})
fig1.show()

# Chart 2: Top 10 Authors by Article Count for ResearchOutputs
all_authors = []
df_research["authors"].dropna().apply(lambda x: all_authors.extend([a.strip() for a in x.split(";") if a.strip() != ""]))
author_counts = Counter(all_authors)
top_authors = author_counts.most_common(10)
top_authors_df = pd.DataFrame(top_authors, columns=["Author", "Article_Count"])
fig2 = px.bar(top_authors_df, x="Author", y="Article_Count", text="Article_Count",
              title="ResearchOutputs: Top 10 Authors by Article Count")
fig2.update_traces(texttemplate='%{text}', textposition='outside')
fig2.show()

# Chart 3: Histogram of Year Distribution for ResearchOutputs (if 'year' exists)
if "year" in df_research.columns:
    fig3 = px.histogram(df_research, x="year",
                        title="ResearchOutputs: Year Distribution",
                        labels={"year": "Year"})
    fig3.show()

ResearchOutputs Data Source EDA
Record Count: 1735
                               title abstract  \
count                           1735        0   
unique                          1322        0   
top     Firms in International Trade      NaN   
freq                               6      NaN   
mean                             NaN      NaN   
std                              NaN      NaN   
min                              NaN      NaN   
25%                              NaN      NaN   
50%                              NaN      NaN   
75%                              NaN      NaN   
max                              NaN      NaN   

                                   authors source  url acknowledgments  \
count                                 1719      0    0               0   
unique                                1096      0    0               0   
top     Giroud; Xavier;;Holger M. Mueller.    NaN  NaN             NaN   
freq                                    14    NaN  NaN          



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#4.API Data

In [16]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Read the CSV file and extract the API portion of the data
df = pd.read_csv("New_And_Original_ResearchOutputs.csv")
df_api = df.iloc[2387:3369]  # Expected record count: 982

# Print basic descriptive statistics for the API data
print("API Data Source EDA")
print("Record Count:", len(df_api))
print(df_api.describe(include='all'))

# Word Cloud for 'Keywords'
keywords_text = " ".join(df_api["Keywords"].dropna().astype(str).tolist())
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(keywords_text)
wordcloud_image = wordcloud.to_array()
fig_wc = px.imshow(wordcloud_image, title="API Keywords Word Cloud")
fig_wc.update_xaxes(showticklabels=False).update_yaxes(showticklabels=False)
fig_wc.show()

# --- Note: Project Status Distribution chart removed as per requirement ---

# Create additional columns: number of authors, title length, and abstract length
df_api["num_authors"] = df_api["authors"].fillna("").apply(lambda x: len([a for a in x.split(";") if a.strip() != ""]))
df_api["title_length"] = df_api["title"].fillna("").apply(lambda x: len(x.split()))
df_api["abstract_length"] = df_api["abstract"].fillna("").apply(lambda x: len(x.split()))

# Chart 1: Histogram of Number of Authors per Article for API
fig1 = px.histogram(df_api, x="num_authors",
                    title="API: Distribution of Authors per Article",
                    labels={"num_authors": "Number of Authors"})
fig1.show()

# Chart 2: Top 10 Authors by Article Count for API
all_authors = []
df_api["authors"].dropna().apply(lambda x: all_authors.extend([a.strip() for a in x.split(";") if a.strip() != ""]))
author_counts = Counter(all_authors)
top_authors = author_counts.most_common(10)
top_authors_df = pd.DataFrame(top_authors, columns=["Author", "Article_Count"])
fig2 = px.bar(top_authors_df, x="Author", y="Article_Count", text="Article_Count",
              title="API: Top 10 Authors by Article Count")
fig2.update_traces(texttemplate='%{text}', textposition='outside')
fig2.show()

# Chart 3: Scatter Plot of Title Length vs. Abstract Length for API
fig3 = px.scatter(df_api, x="title_length", y="abstract_length",
                  title="API: Title Length vs. Abstract Length",
                  labels={"title_length": "Title Length (words)", "abstract_length": "Abstract Length (words)"})
fig3.show()

API Data Source EDA
Record Count: 982
                                                    title  \
count                                                 982   
unique                                                982   
top     International Trade and Macroeconomic Dynamics...   
freq                                                    1   
mean                                                  NaN   
std                                                   NaN   
min                                                   NaN   
25%                                                   NaN   
50%                                                   NaN   
75%                                                   NaN   
max                                                   NaN   

                      abstract         authors source  url acknowledgments  \
count                      982             977      0    0               0   
unique                     867             949      0    0               



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

