In [None]:
import pandas as pd
import gzip

def _parse_info(info):
    """Extract the gene symbol and clinical significance from a VCF INFO string.

    Args:
        info: The raw INFO column, a ``;``-separated list of ``KEY=VALUE`` items.

    Returns:
        A ``(gene, significance)`` tuple. Either element is ``"-"`` when the
        matching ``GENEINFO`` / ``CLNSIG`` key is missing or has an empty value.
    """
    gene = "-"
    significance = "-"
    for item in info.split(";"):
        # partition() splits on the first "=" only, so a value that itself
        # contains "=" is kept whole instead of being truncated.
        key, sep, value = item.partition("=")
        if not sep:
            continue  # flag-style entry with no value
        if key == "GENEINFO":
            # Keep only the text before the first ":" (presumably the first
            # gene symbol of a "SYMBOL:id|SYMBOL:id" list - confirm in the VCF header).
            gene = value.split(":")[0] or "-"
        elif key == "CLNSIG":
            significance = value or "-"
    return gene, significance


def vcf_to_csv(vcf_file, output_csv):
    """Convert a (optionally gzipped) VCF file into a flat CSV of variants.

    Header/meta lines (starting with ``#``) and blank lines are skipped. Each
    remaining record contributes one row with the columns ``Chromosome``,
    ``Position``, ``ID``, ``Ref``, ``Alt``, ``Gene`` and
    ``Clinical_Significance``; the last two come from the ``GENEINFO`` and
    ``CLNSIG`` INFO keys and fall back to ``"-"`` when absent.

    Args:
        vcf_file: Path to the input VCF (``str`` or path-like). A name ending
            in ``.gz`` (any case) is read through ``gzip``.
        output_csv: Destination path for the CSV, written without an index.

    Raises:
        ValueError: If a data record has fewer than the 8 tab-separated
            columns needed to reach INFO.
    """
    rows = []

    # str() lets pathlib.Path inputs through; lower() also accepts ".GZ".
    is_gzipped = str(vcf_file).lower().endswith(".gz")
    open_func = gzip.open if is_gzipped else open

    # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can fail
    # on non-ASCII bytes in annotation text.
    with open_func(vcf_file, "rt", encoding="utf-8") as vcf:
        for line_no, line in enumerate(vcf, start=1):
            if line.startswith("#"):
                continue
            # Strip only the line terminator; a full strip() would also eat a
            # trailing tab and silently drop an empty final column.
            record = line.rstrip("\r\n")
            if not record.strip():
                continue  # tolerate blank lines (commonly at end of file)
            cols = record.split("\t")
            if len(cols) < 8:
                raise ValueError(
                    f"{vcf_file}: line {line_no} has {len(cols)} tab-separated "
                    "columns; expected at least 8 (CHROM..INFO)"
                )
            chrom, pos, var_id, ref, alt = cols[:5]
            gene, significance = _parse_info(cols[7])
            rows.append([chrom, pos, var_id, ref, alt, gene, significance])

    df = pd.DataFrame(
        rows,
        columns=["Chromosome", "Position", "ID", "Ref", "Alt", "Gene", "Clinical_Significance"],
    )

    df.to_csv(output_csv, index=False)
    print(f"Conversion complete: {output_csv}")

# Convert the locally downloaded VCF; the relative output path means the CSV
# is written to the kernel's current working directory.
clinvar_vcf_path = r"C:\Users\aashutosh kumar\Downloads\clinvar.vcf\clinvar.vcf"
converted_csv_path = "clinvar_converted.csv"
vcf_to_csv(clinvar_vcf_path, converted_csv_path)


Conversion complete: clinvar_converted.csv


In [1]:
import pandas as pd

# Read the converted CSV and preview the first ten rows.
# The stray echoed read_csv line in this cell's saved output looks like a
# pandas DtypeWarning (mixed types under chunked inference) - presumably from
# Chromosome holding both numeric and text labels. Pinning that column to str
# and disabling chunked inference keeps the column types consistent.
data = pd.read_csv(
    r"C:\Projects\Gene-Mutation-Detection\clinvar_converted.csv",
    dtype={"Chromosome": str},
    low_memory=False,
)
data.head(10)

  data = pd.read_csv(r"C:\Projects\Gene-Mutation-Detection\clinvar_converted.csv")


Unnamed: 0,Chromosome,Position,ID,Ref,Alt,Gene,Clinical_Significance
0,1,66926,3385321,AG,A,OR4F5,Uncertain_significance
1,1,69134,2205837,A,G,OR4F5,Likely_benign
2,1,69314,3205580,T,G,OR4F5,Uncertain_significance
3,1,69423,3205581,G,A,OR4F5,Uncertain_significance
4,1,69581,2252161,C,G,OR4F5,Uncertain_significance
5,1,69682,2396347,G,A,OR4F5,Uncertain_significance
6,1,69731,3205582,T,C,OR4F5,Uncertain_significance
7,1,69769,2288999,T,C,OR4F5,Uncertain_significance
8,1,69995,2351346,G,C,OR4F5,Uncertain_significance
9,1,924518,3388928,G,C,SAMD11,Likely_benign


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the converted table. Chromosome is pinned to str and chunked type
# inference is disabled so columns are typed consistently (the echoed read_csv
# line in this cell's saved output looks like a pandas DtypeWarning).
df = pd.read_csv(
    r"C:\Users\aashutosh kumar\Downloads\clinvar_converted.csv",
    dtype={"Chromosome": str},
    low_memory=False,
)
# Reassign instead of inplace=True so the step reads as an explicit transform.
df = df.drop(columns=["ID"])

# Cast to str first so every categorical column has a single, sortable type
# before it is label-encoded.
categorical_cols = ["Gene", "Ref", "Alt", "Chromosome"]
df[categorical_cols] = df[categorical_cols].astype(str)

# One fitted encoder per column is kept in `encoders` so the integer codes can
# be mapped back to the original strings later.
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# Target: 1 = label contains the case-sensitive substring "Pathogenic",
# 0 = everything else (not only benign - e.g. "Uncertain_significance" rows
# also become 0). Missing values are mapped to 0 instead of raising.
# NOTE(review): a lower-case "Likely_pathogenic" label, if present, does not
# match and is therefore labelled 0 - confirm this is intended.
df["Clinical_Significance"] = (
    df["Clinical_Significance"]
    .str.contains("Pathogenic", regex=False, na=False)
    .astype("int64")
)

  df = pd.read_csv(r"C:\Users\aashutosh kumar\Downloads\clinvar_converted.csv")


In [2]:
# Show the first four rows of the encoded frame.
df.head(n=4)

Unnamed: 0,Chromosome,Position,Ref,Alt,Gene,Clinical_Significance
0,0,66926,3845,1,10871,0
1,0,69134,0,8713,10871,0
2,0,69314,24652,8713,10871,0
3,0,69423,16297,1,10871,0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# Features are every column except the binary target.
X = df.drop(columns=["Clinical_Significance"])
y = df["Clinical_Significance"]

# Hold out 20% for evaluation. stratify=y keeps the 0/1 class ratio the same
# in both splits, so the test metrics are not skewed by an unlucky draw of the
# rarer class; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit a 100-tree random forest on the training split. n_jobs=-1 builds the
# trees on all available cores, which only affects wall-clock time; the seed
# is unchanged.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split, then report overall accuracy followed by the
# per-class precision/recall/F1 table.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)