## Use the VIBRANT output files to count the number of predicted genes
- All VIBRANT annotation files were concatenated into a single file.
- The annotation files can be found in the VIBRANT output folder (`VIBRANT_annotations*.tsv`).
- The start and stop positions of all predicted proteins were extracted from the predicted protein headers.
- Protein prediction was performed by VIBRANT, which uses Prodigal.

In [None]:
# Load and clean the table of protein start/stop coordinates.
# The file is read as tab-separated and must provide a 'protein' column and an
# 'st_stop' column (both are accessed by name below).
df_ln = pd.read_csv('Gene_pred_all_genomes/230403_combinedphageheader_startstop.tsv', sep='\t')

# Strip the '>' marker from the protein identifiers.
# regex=False is passed to every replace in this cell: '(', ')' and '.' are
# regex metacharacters, and the default value of `regex` differs between
# pandas releases, so the literal match is requested explicitly.
df_ln['protein'] = df_ln['protein'].str.replace('>', '', regex=False)

# Remove the brackets around the coordinate pair.
df_ln['st_stop'] = df_ln['st_stop'].str.replace('(', '', regex=False)
df_ln['st_stop'] = df_ln['st_stop'].str.replace(')', '', regex=False)

# Split on the first '.' only. `n` must be given by keyword: pandas 2.x no
# longer accepts it as a positional argument.
# NOTE(review): presumably the values look like 'start..stop' - confirm against
# the input file.
df_ln[['start', 'stop']] = df_ln['st_stop'].str.split('.', n=1, expand=True)

# Drop any '.' characters that remain in the 'stop' part after the split.
df_ln['stop'] = df_ln['stop'].str.replace('.', '', regex=False)


In [None]:
# Load the combined VIBRANT gene annotation table (tab-separated).
df_genepred = pd.read_csv(
    'Gene_pred_all_genomes/230404_annotations_all.tsv',
    delimiter='\t',
)

In [None]:
# Combine the start/stop coordinates with the gene annotations.
# Inner join on 'protein': only proteins present in both tables are kept.
df1 = df_ln.merge(df_genepred, how='inner', on='protein')

In [None]:
# Assign each protein a coarse category based on its VOG name.
# numpy is imported directly in this cell because the `pd.np` alias is no
# longer available in pandas 2.x.
import numpy as np

# (search term, label) pairs in priority order: when a name matches several
# terms, the first matching pair decides the label.
activity_rules = [
    ("tail", "tail"),
    ("head", "head"),
    ("hypothetical protein", "hypothetical"),
    ("terminase", "terminase"),
    ("capsid", "capsid"),
]

# na=False makes rows without a VOG name evaluate to False for every term, so
# they end up in "other". Without it the missing values stay NaN in the mask,
# and NaN counts as True when used as a condition, which would label those
# rows "tail".
activity_masks = [
    df1['VOGname'].str.contains(term, case=False, na=False)
    for term, _ in activity_rules
]
activity_labels = [label for _, label in activity_rules]

# Rows matching none of the terms are labelled "other".
df1['Activity'] = np.select(activity_masks, activity_labels, default="other")

In [None]:
# Show how many proteins were assigned to each Activity category.
pd.DataFrame(df1['Activity'].value_counts())

In [None]:
# Load the CRISPR spacer table (tab-separated).
df_crispr = pd.read_csv('../spacers_vOTUs_crass.tsv', delimiter='\t')

# Attach the CRISPR data to the annotated proteins.
# Inner join on 'scaffold': only scaffolds present in both tables are kept.
df = df1.merge(df_crispr, how='inner', on='scaffold')

# The gene coordinates were parsed from text, so convert them to numbers
# before comparing them with the CRISPR coordinates.
for coord in ('start', 'stop'):
    df[coord] = pd.to_numeric(df[coord])

# Compare the gene start/stop with the CRISPR start/stop (sstart/send) and
# keep a row in either of two cases:
#   1. the gene lies completely inside the sstart..send interval, or
#   2. the sstart..send interval lies completely inside the gene.
# NOTE(review): both tests assume sstart <= send - confirm this holds for
# every row of the spacer table.
gene_inside_region = df['start'].ge(df['sstart']) & df['stop'].le(df['send'])
region_inside_gene = df['start'].le(df['sstart']) & df['stop'].ge(df['send'])
df = df[gene_inside_region | region_inside_gene]

In [None]:
# Count the Activity categories again, now only for the rows kept by the
# CRISPR filter. This rebinds `df` to the table of counts.
df = df.loc[:, 'Activity'].value_counts().to_frame()
