# classification model using Lasso regularization.

The first function, txt_csv, converts a tab-delimited text file to a CSV file. It takes two parameters: the path of the text file and the name of the CSV file to create, given without the '.csv' extension (the function appends it).

Next, the converted csv file is read using Pandas and any missing values are removed.

The transpose function is used to convert the rows into columns and columns into rows.

The MinMaxScaler function from the scikit-learn library is used to scale the data to a range of 0 to 1.

The 'sample type' column is created by extracting two characters from each sample name (the slice [13:15], i.e. the 14th and 15th characters of each row label of the transposed data). Note that the code below does not sort the data by the 'sample type' column.

The 'sample type' column is converted to binary values ('11' becomes 0 and '01' becomes 1) and the index of the data is reset.

The 'X' variable is created by dropping the 'sample type' column from the data, while the 'y' variable is set as the 'sample type' column.

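The code uses L1 regularization (the penalty also used by Lasso) to select important features from the input data: an L1-penalized logistic regression is fitted through scikit-learn's SelectFromModel, and the names of the selected features are written to 'lasso.csv'.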

In [7]:
# Import the library
import csv
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler

# Function to convert text file to CSV format
# When the raw data is in txt format
def txt_csv(path: str, filename: str) -> None:
    """Convert a tab-delimited text file into a comma-separated CSV file.

    Each non-empty line of the input is stripped of surrounding whitespace,
    split on tab characters, and written as one CSV row.

    Args:
        path: Path of the tab-delimited text file to read.
        filename: Output file name WITHOUT extension; '.csv' is appended.

    Returns:
        None. The result is written to ``filename + '.csv'``.
    """
    with open(path) as in_file:
        # NOTE: strip() removes leading/trailing tabs as well as spaces and
        # newlines, so empty first/last fields on a line are dropped. This is
        # kept as-is because downstream loading may rely on it.
        stripped = (line.strip() for line in in_file)
        # Lazily split every non-blank line into its tab-separated fields.
        rows = (line.split("\t") for line in stripped if line)
        # newline='' is required by the csv module: the writer emits its own
        # '\r\n' terminators, and without it platforms that translate
        # newlines would produce '\r\r\n' (blank rows between records).
        with open(filename + '.csv', 'w', newline='') as out_file:
            csv.writer(out_file).writerows(rows)

# Convert the tab-delimited raw file to 'omics.csv' (txt_csv appends '.csv')
txt_csv("/content/miR_norm.counts (2).csv", 'omics')

# Load the converted CSV file and drop every row that has a missing value
raw_data = pd.read_csv('omics.csv')
raw_data = raw_data.dropna()

# Transpose so that the input file's columns become the rows of `data`
# (presumably one row per sample and one column per feature - confirm
# against the input file layout)
data = raw_data.transpose()

# Scale every column to the [0, 1] range with MinMaxScaler.
# The scaled values are placed in a new DataFrame (same index and column
# labels) instead of being assigned back into the existing frame in place.
scaler = MinMaxScaler()
data = pd.DataFrame(
    scaler.fit_transform(data),
    index=data.index,
    columns=data.columns,
)

# Take characters [13:15] of every row label as that row's sample-type code
# (assumes labels are long enough to contain the code at that position,
# e.g. TCGA-style barcodes - TODO confirm)
column_names = data.index
sample_type = [name[13:15] for name in column_names]
data['sample type'] = sample_type

# Encode the sample-type codes as binary labels: '11' -> 0, '01' -> 1.
# Any other code is left unchanged by replace().
data['sample type'] = data['sample type'].replace({'11': 0, '01': 1})

# Discard the row labels in favour of a 0..n-1 index and clear the name of
# the column axis
data.reset_index(drop=True, inplace=True)
data.columns.name = None

# Split into features (X) and target (y).
# `columns=` is used because passing the axis positionally
# (data.drop("sample type", 1)) is deprecated and rejected by pandas 2.x.
X = data.drop(columns="sample type")
y = data["sample type"]

# Wrap an L1-penalised logistic regression in SelectFromModel so that the
# fitted model can be used to pick a subset of the features
l1_classifier = LogisticRegression(C=1, penalty='l1', solver='liblinear')
sel_ = SelectFromModel(l1_classifier)

# Fit the selector on the features X and the flattened target vector y
sel_.fit(X, np.asarray(y).ravel(order='C'))

# Make sure X is a DataFrame so its column labels can be indexed by the mask
X = pd.DataFrame(X)

# Boolean mask over the columns of X: True for every selected feature
support_mask = sel_.get_support()
selected_feat = X.columns[support_mask]

# Number of coefficients of the fitted estimator that are exactly zero
zero_coef_count = np.sum(sel_.estimator_.coef_ == 0)

# Report the feature counts
for template, value in (
    ('total features: {}', X.shape[1]),
    ('selected features: {}', len(selected_feat)),
    ('features with coefficients shrank to zero: {}', zero_coef_count),
):
    print(template.format(value))

# Save the names of the selected features to 'lasso.csv' (no index column)
selected_frame = pd.DataFrame(list(selected_feat))
selected_frame.to_csv('lasso.csv', index=False)

total features: 1881
selected features: 22
features with coefficients shrank to zero: 1859


  data['sample type'] = sample_type
  X = data.drop("sample type",1)
