# Systematically Merge Environmental and Genome Sequence Data
Combine extracted_sequences.csv and delhi_env_1960_2024.csv to produce combined_dataset.csv containing:

- Accession
- Collection_Year
- Combined_Sequence
- Avg_Temp_C
- CO2_Emission_Mt

### Import required libraries

In [1]:
import pandas as pd

### Load data

In [2]:
# Extracted sequences
seq_df = pd.read_csv("../data/extracted_sequences.csv")

# Delhi environmental data
env_df = pd.read_csv("../data/delhi_env_1960_2024.csv")

In [3]:
seq_df.head(5)

Unnamed: 0,Accession,Collection_Year,Combined_Sequence
0,PV554916.1,2024.0,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...
1,PV554917.1,2024.0,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...
2,PV554918.1,2024.0,CCGACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCT...
3,PV554919.1,2024.0,GACAAGAACAGTTTCGACTCGGAAGCTTGCTTAACGTAGTGCTAAC...
4,PV554920.1,2024.0,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...


In [4]:
env_df.head(5)

Unnamed: 0,Year,Avg_Temp_C,CO2_Emission_Mt
0,1960,24.1,147.1
1,1961,24.0,155.7
2,1962,24.19,214.4
3,1963,24.4,238.8
4,1964,24.08,255.8


### Prepare columns for merging

In [5]:
# Drop rows with missing Collection_Year
seq_df = seq_df.dropna(subset=["Collection_Year"])

# Convert Collection_Year to int
seq_df["Collection_Year"] = seq_df["Collection_Year"].astype(int)

# Rename for merge compatibility
env_df = env_df.rename(columns={"Year": "Collection_Year"})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df["Collection_Year"] = seq_df["Collection_Year"].astype(int)


In [6]:
seq_df.head(5)

Unnamed: 0,Accession,Collection_Year,Combined_Sequence
0,PV554916.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...
1,PV554917.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...
2,PV554918.1,2024,CCGACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCT...
3,PV554919.1,2024,GACAAGAACAGTTTCGACTCGGAAGCTTGCTTAACGTAGTGCTAAC...
4,PV554920.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...


In [7]:
env_df.head(5)

Unnamed: 0,Collection_Year,Avg_Temp_C,CO2_Emission_Mt
0,1960,24.1,147.1
1,1961,24.0,155.7
2,1962,24.19,214.4
3,1963,24.4,238.8
4,1964,24.08,255.8


### Merge datasets

In [8]:
merged_df = pd.merge(seq_df, env_df, on="Collection_Year", how="inner")
print(f"Merged dataset shape: {merged_df.shape}")
merged_df.head()

Merged dataset shape: (395, 5)


Unnamed: 0,Accession,Collection_Year,Combined_Sequence,Avg_Temp_C,CO2_Emission_Mt
0,PV554916.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...,26.16,2489.9
1,PV554917.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...,26.16,2489.9
2,PV554918.1,2024,CCGACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCT...,26.16,2489.9
3,PV554919.1,2024,GACAAGAACAGTTTCGACTCGGAAGCTTGCTTAACGTAGTGCTAAC...,26.16,2489.9
4,PV554920.1,2024,GACAAAGACAGATTCTTTGAGGAAGCTAAGCTTAACGTAGTTCTAA...,26.16,2489.9


### Save combined dataset

In [9]:
merged_df.to_csv("../data/combined_dataset.csv", index=False)
print("Combined dataset saved to ../data/combined_dataset.csv")

Combined dataset saved to ../data/combined_dataset.csv
