# Classification model using ANOVA feature selection

The first function, `txt_csv`, converts a tab-delimited text file to a CSV file. It takes two parameters: the path of the text file and the name of the CSV file to be created, given without the `.csv` extension (the function appends it).

Next, the converted csv file is read using Pandas and any missing values are removed.

The transpose function is used to convert the rows into columns and columns into rows.

The MinMaxScaler function from the scikit-learn library is used to scale the data to a range of 0 to 1.

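The 'sample type' column is created by extracting two characters (positions 14–15, i.e. the slice `[13:15]`) from each sample name; after the transpose, the sample names are the row labels. Note that the code shown here does not sort the data by the 'sample type' column; rows keep their original order.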

The 'sample type' column is converted to binary values ('11' becomes 0 and '01' becomes 1), and the index of the data is reset.

The 'X' variable is created by dropping the 'sample type' column from the data, while the 'y' variable is set as the 'sample type' column.

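The code uses ANOVA feature selection (`SelectKBest` with the `f_classif` F-test) to keep the 250 highest-scoring features of the input data, and writes the names of those features to `annova.csv`.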

In [None]:
# Import the library
import csv
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler

# Function to convert text file to CSV format
# When the raw data is in txt format
def txt_csv(path, filename):
    """Convert a tab-delimited text file into a comma-separated CSV file.

    Every input line is stripped of surrounding whitespace, blank lines are
    skipped, and the remaining lines are split on tab characters. Because
    tabs count as whitespace, empty fields at the very start or end of a
    line are dropped by the stripping step.

    Args:
        path: Path of the tab-delimited text file to read.
        filename: Name of the output file without extension; ``.csv`` is
            appended to it.
    """
    with open(path) as in_file:
        # Generators keep the conversion streaming: one line in memory at a time.
        stripped = (line.strip() for line in in_file)
        rows = (line.split("\t") for line in stripped if line)
        # newline='' hands line-ending control to the csv module. Without it,
        # platforms that translate "\n" on write (Windows) turn the writer's
        # "\r\n" into "\r\r\n", which shows up as a blank row after each record.
        with open(filename + '.csv', 'w', newline='') as out_file:
            csv.writer(out_file).writerows(rows)

# Convert the tab-delimited input file into 'omics.csv' via txt_csv
txt_csv("/content/miR_norm.counts (2).csv", 'omics')

# Load the generated CSV and discard every row that contains a missing value
raw_data = pd.read_csv('omics.csv').dropna()

# Swap rows and columns (the later steps read the sample names from the row labels)
data = raw_data.T

# Rescale every column to the [0, 1] range
# (needed when the input data is not already normalized)
scaler = MinMaxScaler()
data[data.columns.tolist()] = scaler.fit_transform(data[data.columns.tolist()])

# Derive each sample's type code from characters [13:15] of its row label
column_names = data.index
sample_type = [name[13:15] for name in column_names]
data['sample type'] = sample_type

# Encode the sample type as a binary label: '11' -> 0, '01' -> 1
# (any other code is left unchanged)
data['sample type'] = data['sample type'].replace({'11': 0, '01': 1})

# Discard the sample names held in the index and clear the columns' axis name
data.reset_index(drop=True, inplace=True)
data.columns.name = None

# Split into the feature matrix X and the target vector y.
# `columns=` is used instead of passing the axis positionally
# (`drop("sample type", 1)`), which pandas >= 2.0 rejects with a TypeError.
X = data.drop(columns="sample type")
y = data["sample type"]

# Score every feature with the ANOVA F-test and keep the 250 highest-scoring ones
fs = SelectKBest(score_func=f_classif, k=250)
X_selected = fs.fit_transform(X, y)

# Selected feature values as a DataFrame (columns are positional: 0..k-1)
anv = pd.DataFrame(X_selected)

# Names of the selected features, in their original column order.
# The selector's own boolean support mask is used instead of comparing every
# original column with every selected column value by value: that comparison
# is O(k * n_features * n_samples) and reports extra or duplicate names
# whenever two features happen to hold identical values.
genes = list(X.columns[fs.get_support()])

# Write the selected gene names to a CSV file (one name per row)
gene = pd.DataFrame(genes)
gene.to_csv('annova.csv',index=False)