In [1]:
import os
import pandas as pd
import re

In [None]:
# Step 1: Set working directory
directory = r"D:\PhD\RA\Schafer\IRA\data"

# Step 2: Get all Excel files in the directory
files = [f for f in os.listdir(directory) if f.endswith('.xlsx')]

# Step 3: Read and append all files
all_data = []
for file in files:
    file_path = os.path.join(directory, file)
    df = pd.read_excel(file_path)
    all_data.append(df)

# Step 4: Merge all data into a single DataFrame
combined_df = pd.concat(all_data, ignore_index=True)

# Step 5: Save the merged dataset before deduplication
merged_file = os.path.join(directory, "merged_all_data.xlsx")
combined_df.to_excel(merged_file, index=False)
print(f"Saved merged dataset: {merged_file}")

Saved merged dataset: D:\PhD\RA\Schafer\IRA\data\merged_all_data.xlsx


In [None]:
# Step 6: Remove duplicate Drug IDs (keeping the first occurrence)
deduplicated_df = combined_df.drop_duplicates(subset=['Citeline Drug ID'], keep='first')

# Step 7: Save the final deduplicated dataset
final_file = os.path.join(directory, "final_deduplicated_data.xlsx")
deduplicated_df.to_excel(final_file, index=False)
print(f"Saved final deduplicated dataset: {final_file}")

In [9]:
# Define the directory where the files are located
directory = r"D:\PhD\RA\Schafer\IRA\data"

# Define the file names
file_name_20200616 = "20200616.xlsx"
file_name_final_deduplicated = "final_deduplicated_data.xlsx"

# Combine directory and file names to create full file paths
file_path_20200616 = os.path.join(directory, file_name_20200616)
file_path_final_deduplicated = os.path.join(directory, file_name_final_deduplicated)

# Load the two Excel files
df_20200616 = pd.read_excel(file_path_20200616)
df_final_deduplicated = pd.read_excel(file_path_final_deduplicated)

# Extract the Citeline Drug IDs from the final_deduplicated_data.xlsx
drug_ids_to_remove = set(df_final_deduplicated['Citeline Drug ID'])

# Filter rows in 20200616.xlsx to keep only those where Drug ID is NOT in drug_ids_to_remove
filtered_df = df_20200616[~df_20200616['drugid'].isin(drug_ids_to_remove)]

# Save the updated DataFrame back to a new file in the same directory
output_file_name = "20200616_filtered.xlsx"
output_file_path = os.path.join(directory, output_file_name)
filtered_df.to_excel(output_file_path, index=False)


## split the new data 

In [13]:
import os
import pandas as pd
import re

# Step 1: Set working directory and load file
directory = r"D:\PhD\RA\Schafer\IRA\data\merged"
file_name = "final_deduplicated_data.xlsx"
file_path = os.path.join(directory, file_name)
df = pd.read_excel(file_path)

active_statuses = [
    'Launched',
    'Phase I Clinical Trial',
    'Phase II Clinical Trial',
    'Phase III Clinical Trial',
    'Preclinical',
    'Pre-registration',
    'No Development Reported'
    'Registered'
]

# Filter only drugs with continued development
df = df[df['Global Status'].isin(active_statuses)].copy()

# Replace NaN values with empty strings in the event columns
df['Event Date'] = df['Event Date'].fillna('').astype(str)
df['Event Type'] = df['Event Type'].fillna('').astype(str)
df['Event Details'] = df['Event Details'].fillna('').astype(str)

# Split the Event Date, Event Type, and Event Details columns by newline (\n)
df['Event Date'] = df['Event Date'].apply(lambda x: re.split(r'\r?\n', x))
df['Event Type'] = df['Event Type'].apply(lambda x: re.split(r'\r?\n', x))
df['Event Details'] = df['Event Details'].apply(lambda x: re.split(r'\r?\n', x))

# Create a new DataFrame to store the transformed data
transformed_data = []

# Create a list to track problematic Citeline Drug IDs
problematic_ids = []

# Iterate through each row in the original DataFrame
for index, row in df.iterrows():
    # Get the lists of Event Date, Event Type, and Event Details
    event_dates = row['Event Date']
    event_types = row['Event Type']
    event_details = row['Event Details']

    # Check for length mismatch
    if not (len(event_dates) == len(event_types) == len(event_details)):
        problematic_ids.append(row['Citeline Drug ID'])
        continue  # Skip this row or optionally handle it with padding

    # Create a new row for each event
    for i in range(len(event_dates)):
        new_row = row.copy()
        new_row['Event Date'] = event_dates[i]
        new_row['Event Type'] = event_types[i]
        new_row['Event Details'] = event_details[i]
        transformed_data.append(new_row)

# Convert the transformed data into a DataFrame
transformed_df = pd.DataFrame(transformed_data)

# Reset index
transformed_df = transformed_df.reset_index(drop=True)

# Ensure output directory exists
output_dir = "D:/PhD/RA/Schafer/IRA/data/transformed"
os.makedirs(output_dir, exist_ok=True)

# Save transformed data
output_file_path = os.path.join(output_dir, "transformed_data2.xlsx")
transformed_df.to_excel(output_file_path, index=False)

print("Transformation complete. Data saved to:", output_file_path)

# Report problematic rows
if problematic_ids:
    print("Mismatched event counts found for the following Citeline Drug IDs:")
    for pid in problematic_ids:
        print(pid)

    # Optionally save to Excel
    pd.Series(problematic_ids, name='Citeline Drug ID').to_excel(
        os.path.join(output_dir, "problematic_ids.xlsx"), index=False)
else:
    print("No mismatched rows found.")

Transformation complete. Data saved to: D:/PhD/RA/Schafer/IRA/data/transformed\transformed_data2.xlsx
Mismatched event counts found for the following Citeline Drug IDs:
71628
245454
134350
183089
64310
70750
163715
75097
292020
147997
34786
290588
291325
33657
89109
69763
160029
38368
114476
42822
125985
126134
178511
117921
31975
89883
85446
86817
36618
25659
43357
111159
90489
169855
77738
80245
141310
113914
210821
202873
200445
68558
62575
29902
111767
120563
132540
139971
91363
72046
68258
28702
28985
38731
131769
187948
252268
36868
13845
15730
247660
151454
3006
278401
37602
27851
247926
14550
82214
34517
235987
10425
115136
222088
42566
16453
23004
123484
33394
150991
229120
74346
219976
239192
163168
61742
74375
190198
208388
253298
238338
202966
155809
129733
35280
68233
17529
122315
148635
230043
25985
19577
190191
148554
214359
81511
32154
226842
229824
28869
197219
187440
174497
153833
36934
161153
203999
32854
167463
35510
85560
199556
63865
179771
42169
187945
81831
7616

In [14]:
import os
import re
import pandas as pd

# === Step 1: Setup paths ===
input_path = r"D:\PhD\RA\Schafer\IRA\data\merged\final_deduplicated_data.xlsx"
output_dir = r"D:\PhD\RA\Schafer\IRA\data\transformed"
output_fixed = os.path.join(output_dir, "fixed_problematic_ids.xlsx")
output_unfixed = os.path.join(output_dir, "still_unfixed_problematic_ids.xlsx")

# === Step 2: Load data ===
df = pd.read_excel(input_path)

# === Step 3: Clean and prepare ===
for col in ['Event Date', 'Event Type', 'Event Details']:
    df[col] = df[col].fillna('').astype(str)

# === Step 4: Load previously saved problematic IDs ===
# Replace [...] with the actual list if not already in memory

problematic_ids_path = os.path.join(output_dir, "problematic_ids.xlsx")
problematic_ids = pd.read_excel(problematic_ids_path)['Citeline Drug ID'].tolist()

# Filter the original dataset to only those rows
df_problematic = df[df['Citeline Drug ID'].isin(problematic_ids)].copy()

# === Step 5: Smart split for Event Details only ===
def smart_split_details(text):
    lines = re.split(r'\r?\n', text)
    merged = []
    for line in lines:
        if not line.strip():
            merged.append('')
            continue
        if merged and line[0].islower() and merged[-1] != '':
            merged[-1] += ' ' + line.strip()
        else:
            merged.append(line.strip())
    return merged

# === Step 6: Process each row ===
fixed_data = []
still_problematic = []

for _, row in df_problematic.iterrows():
    event_dates = re.split(r'\r?\n', row['Event Date'])
    event_types = re.split(r'\r?\n', row['Event Type'])
    event_details = smart_split_details(row['Event Details'])

    if len(event_dates) == len(event_types) == len(event_details):
        for i in range(len(event_dates)):
            new_row = row.copy()
            new_row['Event Date'] = event_dates[i]
            new_row['Event Type'] = event_types[i]
            new_row['Event Details'] = event_details[i]
            fixed_data.append(new_row)
    else:
        still_problematic.append({
            'Citeline Drug ID': row['Citeline Drug ID'],
            'Event Date Count': len(event_dates),
            'Event Type Count': len(event_types),
            'Event Detail Count': len(event_details)
        })

# === Step 7: Save results ===
os.makedirs(output_dir, exist_ok=True)

# Fixed rows
fixed_df = pd.DataFrame(fixed_data).reset_index(drop=True)
fixed_df.to_excel(output_fixed, index=False)

# Remaining problematic rows
still_problematic_df = pd.DataFrame(still_problematic)
still_problematic_df.to_excel(output_unfixed, index=False)

print(f"✅ Fixed rows saved to: {output_fixed}")
print(f"⚠️ Still unmatched rows saved to: {output_unfixed}")

✅ Fixed rows saved to: D:\PhD\RA\Schafer\IRA\data\transformed\fixed_problematic_ids.xlsx
⚠️ Still unmatched rows saved to: D:\PhD\RA\Schafer\IRA\data\transformed\still_unfixed_problematic_ids.xlsx


In [2]:
import os
import re
import pandas as pd

# === Step 1: Define paths ===
base_dir = r"F:\PhD\RA\Schafer\IRA\data"
input_unfixed_path = os.path.join(base_dir, "transformed", "still_unfixed_problematic_ids.xlsx")
original_data_path = os.path.join(base_dir, "merged", "final_deduplicated_data.xlsx")
output_expanded = os.path.join(base_dir, "transformed", "expanded_unfixed_fixed.xlsx")

# === Step 2: Load still unmatched IDs and original dataset ===
still_unfixed_df = pd.read_excel(input_unfixed_path)
unfixed_ids = still_unfixed_df['Citeline Drug ID'].tolist()

df_original = pd.read_excel(original_data_path)
for col in ['Event Date', 'Event Type', 'Event Details']:
    df_original[col] = df_original[col].fillna('').astype(str)

# === Step 3: Filter original dataset for those IDs ===
df_unfixed = df_original[df_original['Citeline Drug ID'].isin(unfixed_ids)].copy()

# === Step 4: Smart split logic ===
def smart_split_details(text):
    lines = re.split(r'\r?\n', text)
    merged = []
    for line in lines:
        if not line.strip():
            merged.append('')
            continue
        if merged and line[0].islower() and merged[-1] != '':
            merged[-1] += ' ' + line.strip()
        else:
            merged.append(line.strip())
    return merged

# === Step 5: Expand each row by max length ===
expanded_rows = []

for _, row in df_unfixed.iterrows():
    dates = re.split(r'\r?\n', row['Event Date'])
    types = re.split(r'\r?\n', row['Event Type'])
    details = smart_split_details(row['Event Details'])

    max_len = max(len(dates), len(types), len(details))
    dates += [''] * (max_len - len(dates))
    types += [''] * (max_len - len(types))
    details += [''] * (max_len - len(details))

    for i in range(max_len):
        new_row = row.copy()
        new_row['Event Date'] = dates[i]
        new_row['Event Type'] = types[i]
        new_row['Event Details'] = details[i]
        expanded_rows.append(new_row)

# === Step 6: Save expanded rows ===
expanded_df = pd.DataFrame(expanded_rows).reset_index(drop=True)
expanded_df.to_excel(output_expanded, index=False)

print(f"📤 Expanded problematic rows saved to: {output_expanded}")

📤 Expanded problematic rows saved to: F:\PhD\RA\Schafer\IRA\data\transformed\expanded_unfixed_fixed.xlsx


## append

In [2]:
import pandas as pd
import os

# === Step 1: Define file paths ===
base_dir = r"F:\PhD\RA\Schafer\IRA\data\transformed"
files = {
    "transformed": os.path.join(base_dir, "transformed_data2.xlsx"),
    "fixed": os.path.join(base_dir, "fixed_problematic_ids.xlsx"),
    "expanded": os.path.join(base_dir, "expanded_unfixed_fixed_man.xlsx")
}

# === Step 2: Load the files ===
dfs = {name: pd.read_excel(path) for name, path in files.items()}

# === Step 3: Compare column names ===
all_columns = {name: set(df.columns) for name, df in dfs.items()}
common_columns = set.intersection(*all_columns.values())
all_columns_union = set.union(*all_columns.values())

# Identify non-common columns per dataset
column_differences = {
    name: sorted(cols - common_columns) for name, cols in all_columns.items()
}

# Print non-matching columns for review
print("🧾 Columns NOT common across all files:")
for name, diffs in column_differences.items():
    print(f"- {name}: {diffs}")

# === Step 4: Check unique Citeline Drug ID intersections ===
unique_ids = {
    name: set(df["Citeline Drug ID"].dropna().unique()) for name, df in dfs.items()
}

# Compute intersections
intersect_transformed_fixed = unique_ids["transformed"] & unique_ids["fixed"]
intersect_transformed_expanded = unique_ids["transformed"] & unique_ids["expanded"]
intersect_fixed_expanded = unique_ids["fixed"] & unique_ids["expanded"]

print("🧾 Unique Citeline Drug ID Intersections:")
print(f"- Transformed ∩ Fixed: {len(intersect_transformed_fixed)} IDs")
print(f"- Transformed ∩ Expanded: {len(intersect_transformed_expanded)} IDs")
print(f"- Fixed ∩ Expanded: {len(intersect_fixed_expanded)} IDs")

# Optional: Uncomment to see exact intersecting IDs
print("Intersecting IDs (transformed & fixed):", intersect_transformed_fixed)
print("Intersecting IDs (transformed & expanded):", intersect_transformed_expanded)
print("Intersecting IDs (fixed & expanded):", intersect_fixed_expanded)

🧾 Columns NOT common across all files:
- transformed: []
- fixed: []
- expanded: []
🧾 Unique Citeline Drug ID Intersections:
- Transformed ∩ Fixed: 0 IDs
- Transformed ∩ Expanded: 0 IDs
- Fixed ∩ Expanded: 0 IDs
Intersecting IDs (transformed & fixed): set()
Intersecting IDs (transformed & expanded): set()
Intersecting IDs (fixed & expanded): set()


In [5]:
import pandas as pd
import os

# === Step 1: Define file paths ===
base_dir = r"F:\PhD\RA\Schafer\IRA\data\transformed"
file_list = [
    "transformed_data2.xlsx",
    "fixed_problematic_ids.xlsx",
    "expanded_unfixed_fixed_man.xlsx"
]

# === Step 2: Load and combine ===
dfs = [pd.read_excel(os.path.join(base_dir, file)) for file in file_list]
combined_df = pd.concat(dfs, ignore_index=True)

# === Step 3: Save result ===
output_path = os.path.join(base_dir, "combined_2_cleaned.xlsx")
combined_df.to_excel(output_path, index=False)
print(f"✅ Combined dataset saved to: {output_path}")

✅ Combined dataset saved to: F:\PhD\RA\Schafer\IRA\data\transformed\combined_2_cleaned.xlsx


## Check

In [8]:
import pandas as pd
import os
import re

# Load the Excel file
file_path = r"D:\PhD\RA\Schafer\IRA\data\merged\final_deduplicated_data.xlsx"
df = pd.read_excel(file_path)

# Replace NaN with empty strings and ensure string type
for col in ['Event Date', 'Event Type', 'Event Details']:
    df[col] = df[col].fillna('').astype(str)

# Filter for specific drug ID
drug_id = 77572
row = df[df['Citeline Drug ID'] == drug_id]

if row.empty:
    print(f"No record found for Citeline Drug ID {drug_id}")
else:
    row = row.iloc[0]  # Get the row as a Series

    # Split on line breaks robustly (handles \n or \r\n)
    event_dates = re.split(r'\r?\n', row['Event Date'])
    event_types = re.split(r'\r?\n', row['Event Type'])
    event_details = re.split(r'\r?\n', row['Event Details'])

    # Determine the maximum length
    max_len = max(len(event_dates), len(event_types), len(event_details))

    # Pad all lists to the same length with ''
    event_dates += [''] * (max_len - len(event_dates))
    event_types += [''] * (max_len - len(event_types))
    event_details += [''] * (max_len - len(event_details))

    # Create new rows
    parsed_rows = []
    for i in range(max_len):
        new_row = row.copy()
        new_row['Event Date'] = event_dates[i]
        new_row['Event Type'] = event_types[i]
        new_row['Event Details'] = event_details[i]
        parsed_rows.append(new_row)

    # Convert to DataFrame
    parsed_df = pd.DataFrame(parsed_rows).reset_index(drop=True)

    # Output path
    output_path = r"D:\PhD\RA\Schafer\IRA\data\transformed\drug_77572_expanded.xlsx"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Save to Excel
    parsed_df.to_excel(output_path, index=False)
    print(f"Parsed data for Citeline Drug ID {drug_id} saved to:\n{output_path}")

Parsed data for Citeline Drug ID 77572 saved to:
D:\PhD\RA\Schafer\IRA\data\transformed\drug_77572_expanded.xlsx


## splite the old data

In [17]:
# Define the directory where the files are located
directory = r"D:\PhD\RA\Schafer\IRA\data"

# Define the file name
file_name = "20200616_filtered.xlsx"

# Combine directory and file name to create the full file path
file_path = os.path.join(directory, file_name)

# Load the data from the Excel file
df = pd.read_excel(file_path)

# Create a new DataFrame to store the transformed data
transformed_data = []

# Iterate through each row in the original DataFrame
for index, row in df.iterrows():
    keyevents = row.get('keyevents', '')
    
    # Skip rows where keyevents is NaN or empty
    if pd.isna(keyevents) or keyevents.strip() == "":
        continue
    
    # Adjusted regex to handle semicolons in event details
    pattern = r'keyEventDate:(.*?);(?:\s*keyEventHistory:(.*?);)?(?:\s*keyEventDetail:(.*?)(?=\s*keyEventDate:|\s*keyEventHistory:|\s*keyEventDetail:|$))?'
    matches = re.findall(pattern, keyevents, re.DOTALL)
    
    for match in matches:
        event_date = match[0].strip() if match[0] else ""
        event_type = match[1].strip() if match[1] else ""  # Can be empty
        event_details = match[2].strip() if match[2] else ""  # Can be empty
        
        # Copy the row and replace with extracted event data
        new_row = row.copy()
        new_row['Event Date'] = event_date
        new_row['Event Type'] = event_type
        new_row['Event Details'] = event_details  # This will now keep everything correctly
        
        # Append to transformed data
        transformed_data.append(new_row)

# Convert to DataFrame
transformed_df = pd.DataFrame(transformed_data)

# Reset index
transformed_df = transformed_df.reset_index(drop=True)

# Define output file path in the desired directory
output_file_name = "transformed_data1.xlsx"
output_file_path = os.path.join(directory, output_file_name)

# Save the transformed data to a new Excel file
transformed_df.to_excel(output_file_path, index=False)

print(f"Transformation complete. Data saved to: {output_file_path}")

Transformation complete. Data saved to: D:\PhD\RA\Schafer\IRA\data\transformed_data1.xlsx
