## Edit and Merge two tables


In [34]:
import pandas as pd

### Read the CSV files and drop rows whose title is missing


In [35]:
# Load the two source tables exactly as stored on disk.
raw_projects = pd.read_csv("assets/projects_info.csv")
raw_taxonomy = pd.read_csv("assets/taxonomy.csv")

# A row without a Title is unusable downstream, so discard it from both tables.
df1, df2 = (frame.dropna(subset=["Title"]) for frame in (raw_projects, raw_taxonomy))

### Strip leading and trailing whitespace from the GitHub URLs


In [36]:
# "Github URL" is the merge key further down; trim surrounding whitespace in
# both tables.
for frame in (df1, df2):
    frame["Github URL"] = frame["Github URL"].str.strip()

### Decide which columns are required for our project


In [37]:
# Columns to carry forward. The order of this list is the order in which the
# columns are selected from each table.
required_columns = [
    "Title", "Paper URL", "Specific URL", "Github URL",
    "Documentation", "Dependencies", "Special features",
    "Entry Points (Scripts)", "Execution Command",
    "External Credentials", "Dataset Dependency",
    "Known Problems", "Terminate",
]

### Select columns


In [38]:
# For each table, keep only the required columns it actually contains,
# preserving the order given in required_columns.
df1_final_cols, df2_final_cols = (
    [name for name in required_columns if name in frame.columns]
    for frame in (df1, df2)
)

In [39]:
# Narrow both tables to the columns chosen above.
df1, df2 = df1[df1_final_cols], df2[df2_final_cols]

### Merge the two tables and drop the duplicated columns


In [40]:
# pandas matches NaN join keys against each other, so taxonomy rows with no
# Github URL would be attached to (and could multiply) every project row that
# also lacks one. Only "Title" was null-filtered earlier, so remove null-key
# rows from the right-hand side before joining.
df2_keyed = df2.dropna(subset=["Github URL"])

# Left join: every df1 row is kept. Where both tables have a column of the same
# name, df1's column keeps its name and df2's copy is suffixed with "_drop".
df3 = pd.merge(df1, df2_keyed, on="Github URL", how="left", suffixes=("", "_drop"))

In [41]:
# Columns ending in "_drop" are the right-hand copies created by the merge
# suffixes; collect their names and remove them in one call.
redundant_cols = list(filter(lambda name: name.endswith("_drop"), df3.columns))
df3 = df3.drop(columns=redundant_cols)

### Examine the merged table


In [42]:
# Print a summary of the merged frame: row count, column names, non-null
# counts per column, and dtypes.
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Title                   99 non-null     object 
 1   Paper URL               99 non-null     object 
 2   Specific URL            91 non-null     object 
 3   Github URL              99 non-null     object 
 4   Documentation           99 non-null     object 
 5   Dependencies            99 non-null     float64
 6   Special features        99 non-null     object 
 7   Entry Points (Scripts)  99 non-null     object 
 8   Execution Command       99 non-null     object 
 9   External Credentials    45 non-null     object 
 10  Dataset Dependency      93 non-null     object 
 11  Terminate               99 non-null     object 
dtypes: float64(1), object(11)
memory usage: 9.4+ KB


### Save the final output


In [44]:
# Write the merged table to disk; index=False leaves the row index out of the file.
output_path = "assets/projects.csv"
df3.to_csv(output_path, index=False)