In [1]:
import pandas as pd
import json
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

# Load the enhanced summary results.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
summary_path = '/Users/adamgeorghiou/Desktop/GIM/Project/data/nlp_results/enhanced_nlp_summary.json'
with open(summary_path, 'r', encoding='utf-8') as f:
    summary = json.load(f)

# ------------------------------
# MERGE COMPOSITE KEYS
# ------------------------------
# Work on a shallow copy so summary['top_applications'] itself is not modified.
apps = summary['top_applications'].copy()

# Labels that all mean "composite". Matching is case-insensitive, in line with
# the fabrication-method merge, so variants such as 'graphene Composite' are
# folded in as well.
# NOTE(review): assumes the summary keys may carry the same mixed casing seen
# in the raw NLP results - confirm against the summary file.
composite_aliases = {'composite', 'graphene composite'}
alias_keys = [key for key in apps if key.lower() in composite_aliases]

# Sum the counts of every alias (0 when none of them is present).
combined_value = sum(apps[key] for key in alias_keys)

# Drop every alias except the canonical key to avoid duplicate bars.
for key in alias_keys:
    if key != 'composite':
        del apps[key]

# Always store the merged total under the canonical 'composite' label.
apps['composite'] = combined_value

# ------------------------------
# MERGE FABRICATION METHOD KEYS
# ------------------------------
# Work on a shallow copy so summary['fabrication_methods'] is not modified.
fab_methods_original = summary['fabrication_methods'].copy()

# Groups of synonyms that should be reported under a single label
# (key = label shown in the chart, value = spellings folded into it).
merge_map = {
    'graphene oxide': ['graphene oxide', 'go', 'GO'],
    'reduced graphene oxide': ['reduced graphene oxide', 'rgo', 'RGO', 'rGO'],
    'CVD': ['CVD', 'chemical vapor deposition']
}

# Build the lower-cased synonym -> label lookup once, instead of re-lowering
# every synonym list for every method. setdefault keeps the first group that
# claims a spelling, matching a first-match-wins scan over merge_map.
canonical_by_synonym = {}
for target_label, synonyms in merge_map.items():
    for synonym in synonyms:
        canonical_by_synonym.setdefault(synonym.lower(), target_label)

# Accumulate counts per label; methods with no synonym entry keep their
# original spelling (and casing) as the label.
fab_methods_merged = {}
for method, count in fab_methods_original.items():
    label = canonical_by_synonym.get(method.lower(), method)
    fab_methods_merged[label] = fab_methods_merged.get(label, 0) + count

# ------------------------------
# DASHBOARD GRID (three panels)
# ------------------------------
# The figure is laid out as 2 rows x 2 columns; the bottom-right slot is
# declared as None, leaving that cell empty.
panel_titles = (
    "Top 10 Graphene Applications",
    "Application Categories",
    "Fabrication Methods"
)
top_row_specs = [{"type": "bar"}, {"type": "pie"}]
bottom_row_specs = [{"type": "bar"}, None]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=panel_titles,
    specs=[top_row_specs, bottom_row_specs]
)

# Panel 1 (row 1, col 1): the ten applications with the highest counts,
# largest first.
sorted_apps = sorted(apps.items(), key=lambda pair: pair[1], reverse=True)[:10]
labels = [name for name, _ in sorted_apps]
values = [total for _, total in sorted_apps]

top_apps_bar = go.Bar(x=labels, y=values, marker_color='royalblue')
fig.add_trace(top_apps_bar, row=1, col=1)

# Panel 2 (row 1, col 2): share of each application category as a pie chart,
# with every slice annotated by its label and percentage.
categories = summary['application_category_distribution']

category_pie = go.Pie(
    labels=list(categories),
    values=list(categories.values()),
    textinfo='label+percent'
)
fig.add_trace(category_pie, row=1, col=2)

# Panel 3 (row 2, col 1): merged fabrication-method counts, largest first.
sorted_fab = sorted(fab_methods_merged.items(), key=lambda pair: pair[1], reverse=True)
fab_labels = [name for name, _ in sorted_fab]
fab_values = [total for _, total in sorted_fab]

fab_methods_bar = go.Bar(x=fab_labels, y=fab_values, marker_color='lightgreen')
fig.add_trace(fab_methods_bar, row=2, col=1)

# Figure-wide settings: dashboard title, fixed 1200x800 canvas, and no legend.
dashboard_layout = dict(
    title_text="Graphene Applications Analysis Dashboard",
    width=1200,
    height=800,
    showlegend=False,
)
fig.update_layout(**dashboard_layout)

# ------------------------------
# EXPORT AND DISPLAY
# ------------------------------
import plotly.io as pio

# All exported artefacts live in one directory; it is created on demand.
output_dir = '/Users/adamgeorghiou/Desktop/GIM/Project/data/visualizations'
os.makedirs(output_dir, exist_ok=True)

# Interactive version of the dashboard as an HTML file.
fig.write_html(os.path.join(output_dir, 'graphene_dashboard.html'))

# Show the figure in the browser. This changes the default renderer for the
# rest of the kernel session, not only for this figure.
pio.renderers.default = 'browser'
fig.show()

# High-resolution static export. The path is derived from output_dir rather
# than repeating the directory, so the two exports cannot drift apart.
# width/height are multiplied by scale, giving a 4000x2400 pixel PNG.
# NOTE(review): static export needs an image-export engine (e.g. kaleido)
# installed in the kernel environment - confirm it is available.
fig.write_image(
    os.path.join(output_dir, 'graphene_dashboard_hi_res.png'),
    format="png",
    width=2000,
    height=1200,
    scale=2
)



In [6]:
import nbformat

# Environment sanity check: report the nbformat version visible to this kernel.
print(f"{nbformat.__version__}")

5.10.4


In [3]:
from PIL import Image

# Verify the pixel dimensions of the exported dashboard PNG.
# The context manager closes the underlying file as soon as the size has been
# read, instead of leaving the handle open for the rest of the session.
with Image.open("/Users/adamgeorghiou/Desktop/GIM/Project/data/visualizations/graphene_dashboard_hi_res.png") as img:
    print(img.size)  # e.g., (4000, 2400) if width=2000, height=1200, scale=2

(4000, 2400)


In [7]:
import json
import pandas as pd
from collections import Counter

# Load the full results file.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open('/Users/adamgeorghiou/Desktop/GIM/Project/data/nlp_results/nlp_results.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

# Extract every application mention across all documents.
# `or []` covers both a missing 'applications' key and one that is present
# but null, which would otherwise raise TypeError when iterated.
all_applications = [
    app['application']
    for document in results
    for app in (document.get('applications') or [])
]

# Count mentions per application label and list the labels alphabetically.
app_counter = Counter(all_applications)
unique_apps = sorted(app_counter.keys())

# Print all unique applications with their mention counts
print(f"Found {len(unique_apps)} unique applications:")
for app in unique_apps:
    print(f"- {app} ({app_counter[app]} mentions)")

# Create a DataFrame for easier analysis, most frequent application first.
# Columns are named explicitly so the frame still has a 'count' column when no
# applications were found; otherwise sort_values('count') raises KeyError.
app_df = pd.DataFrame(
    list(app_counter.items()),
    columns=['application', 'count'],
).sort_values('count', ascending=False)

# Display the top applications
print("\nTop 20 applications by frequency:")
print(app_df.head(20))

Found 82 unique applications:
- additive (360 mentions)
- antibacterial (245 mentions)
- barrier (286 mentions)
- battery (2244 mentions)
- biomedical (243 mentions)
- biosensing (113 mentions)
- biosensor (251 mentions)
- catalyst (1244 mentions)
- cellular (121 mentions)
- chemical conversion (11 mentions)
- circuit (401 mentions)
- coating (1031 mentions)
- composite (5223 mentions)
- conductor (131 mentions)
- display (1090 mentions)
- drug delivery (115 mentions)
- electrocatalyst (136 mentions)
- electrode (2961 mentions)
- electronic (2518 mentions)
- energy harvesting (56 mentions)
- energy storage (715 mentions)
- environmental remediation (63 mentions)
- filter (289 mentions)
- flexible electronics (60 mentions)
- fuel cell (268 mentions)
- gas separation (29 mentions)
- graphene Based Photocatalyst (1 mentions)
- graphene Coating (2 mentions)
- graphene Composite (15 mentions)
- graphene Electrode (1 mentions)
- graphene Electronic (4 mentions)
- graphene Membrane (2 mention

In [None]:
import pandas as pd

# Location of the cleaned dataset
csv_path = "/Users/adamgeorghiou/Desktop/GIM/Project/data/processed/cleaned_graphene_data.csv"

# Read the whole CSV into memory
df = pd.read_csv(csv_path)

# A date counts as missing when it is NaN or, once stringified and stripped,
# an empty string.
date_is_null = df['published_date'].isna()
date_is_blank = df['published_date'].astype(str).str.strip().eq('')
missing_date_mask = date_is_null | date_is_blank
missing_date_df = df.loc[missing_date_mask]

# How many rows lack a publication date
print(f"Number of rows missing published_date: {len(missing_date_df)}")

# Break the missing-date rows down by their source
print("\nSources with missing dates:")
print(missing_date_df['source'].value_counts())

# Preview the first few affected rows (rendered as the cell's output)
missing_date_df.head()


Number of rows missing published_date: 3146

Sources with missing dates:
source
patents    3146
Name: count, dtype: int64


Unnamed: 0,title,abstract,published_date,source,authors
5445,Production of graphene materials directly from...,,,patents,"aruna zhamu, bor z. jang"
5446,"Utilizing Nanoscale Materials as Dispersants, ...",,,patents,mike foley
5447,A kind of non-precious metal nitrogen-doped MO...,,,patents,"王建龙, 王诚, 赵卿"
5448,"Composite, carbon composite including the comp...",,,patents,"hyunjae song, inhyuk son, inyong song, jaeman ..."
5449,Preparation method of nitrogen-doped graphene-...,,,patents,"彭卿, 朱有启, 李亚栋, 李治, 王定胜, 陈晨"
