In [1]:
# Import pandas for data combining
import pandas as pd

In [2]:
# Open the transposable elements data as a dataframe.
te_df = pd.read_table("../../../data/2018_06_12_te_enhancers_ml/test.tsv", header = None)

In [3]:
# Create a data frame to store all the transposable element locations
te_loc_df = pd.DataFrame(columns = ["chr", "start", "end"])
te_loc_df["chr"] = te_df.iloc[:,0]
te_loc_df["start"] = te_df.iloc[:,1]
te_loc_df["end"] = te_df.iloc[:,2]

# Delete repeats from the data frame.
te_new_df = te_loc_df.copy().drop_duplicates()

In [4]:
# Get the set of all transcription factors as column labels.
col_set = set(te_df.iloc[:,8])

# Convert to a list.
col_list = list(col_set)  
    
# Add all the transcription factors as column labels for the new data frame.
for tf in col_list:
    te_new_df[tf] = 0
    
# Create a column with 1 or 0 value for if enhancer overlaps transposable element
te_new_df["enhancer"] = 0

In [5]:
# Iterate through the original dataframe
for row in te_df.itertuples():
    # Chromosome is now in index 1; start location in index 2; end location in index 3 of row.
    # Match chromosome, start location, and end location from the old and new data frames
    # and update the corresponding column of the transcription factor in the new data frame.
    te_new_df.loc[((te_new_df["chr"] == row[1]) & 
                (te_new_df["start"] == row[2]) &
                (te_new_df["end"] == row[3])), [row[9]]] += 1
    
    # Update the enhancer column as needed.
    if row[14] == "1":
        te_new_df.loc[((te_new_df["chr"] == row[1]) & 
                (te_new_df["start"] == row[2]) &
                (te_new_df["end"] == row[3])), "enhancer"] = 1

In [6]:
# Set column for machine learning "y" vector as the last one, so all transcription factors plus
# three columsn for the locations.
end_index = len(col_set) + 3

# Set the machine learning input vector as all columns of transcription factors
x_df = te_new_df.copy().iloc[:,3:end_index]

# Set the machine learning prediction vector as the last column, which tells if enhancer is present
y_df = te_new_df.copy().iloc[:,end_index]

In [7]:
# Import needed libraries from scikit-learn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [8]:
# Create a random forest classifier model
rfc = RandomForestClassifier(n_estimators = 1000, n_jobs = -1)

In [9]:
# Perform 10-fold cross validation on the random forest model
cvs = cross_val_score(rfc, x_df, y_df)
print(cvs)

[0.98963731 0.98958333 0.9947644 ]
