In [1]:
import pandas as pd
import re

In [None]:
# Load the raw dataset. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('nadi_pulse_with_text.csv')
df.head()

In [None]:
# Keep only the three columns used by the rest of the notebook; `df` is rebound,
# so the other columns of the loaded frame are no longer reachable through it.
df = df[['Patient Age','text_content','Admin Comment']]
df.head()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Impute missing 'Admin Comment' values with the column's most frequent value
# (`mode()[0]` picks the first entry when several values tie).
df['Admin Comment'] = df['Admin Comment'].fillna(df['Admin Comment'].mode()[0])

In [None]:
df.isnull().sum()

In [None]:
def clear_text(text, n_trailing=25):
    """Remove a fixed-length trailer from the end of ``text``.

    Parameters
    ----------
    text : str
        Value to trim (any sliceable sequence with a length works).
    n_trailing : int, default 25
        Number of trailing characters to drop. The default keeps the
        behaviour this function always had.

    Returns
    -------
    str
        ``text`` without its last ``n_trailing`` characters. The result is
        empty when ``text`` has ``n_trailing`` characters or fewer.

    Raises
    ------
    ValueError
        If ``n_trailing`` is negative.
    """
    if n_trailing < 0:
        raise ValueError("n_trailing must be non-negative")
    # Slice by an explicit end position instead of ``text[:-n_trailing]``:
    # ``text[:-0]`` would be ``text[:0]`` and wrongly return an empty value.
    keep = max(len(text) - n_trailing, 0)
    return text[:keep]

In [None]:
# Trim the trailer from every text_content value via clear_text.
# NOTE: the column is overwritten in place, so running this cell a second time
# trims the already-trimmed text again.
df["text_content"] = df["text_content"].apply(clear_text)

In [None]:
df.head()

In [None]:
# Snapshot of the 'Admin Comment' values taken before the encoding step further
# down overwrites the column with numeric codes.
admin_values = df["Admin Comment"].to_list()

In [None]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
ord_en = OrdinalEncoder()
la = LabelEncoder()

In [None]:
# Replace each distinct 'Admin Comment' value with a numeric code, fitted on the
# whole column. The fitted `ord_en` retains the category <-> code mapping.
# NOTE(review): this column is later used as the target `y`; scikit-learn's docs
# suggest LabelEncoder for targets and OrdinalEncoder for features - confirm intent.
df['Admin Comment'] = ord_en.fit_transform(df[['Admin Comment']])

In [None]:
df.head()

In [None]:
# Encoded codes in row order; positionally aligned with `admin_values`, which was
# captured from the same column before encoding.
admin_keys = df["Admin Comment"].to_list()

In [None]:
# Marker texts stripped from the raw content further down: first the full
# "Start nPULSE001" marker, then any remaining bare "Start".
# Both are handed to re.sub, i.e. they are interpreted as regex patterns.
string_to_remove = "Start nPULSE001"
r_text = "Start"

In [None]:
def remove_string_from_content(df, column_name, string_to_remove):
    """Strip every literal occurrence of ``string_to_remove`` from a text column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place: the cleaned
        text is written to its ``"nadi_data"`` column (created or overwritten).
    column_name : str
        Name of the column to read the text from.
    string_to_remove : str
        Text to delete. It is matched literally, so characters such as ``.``,
        ``+`` or ``(`` have no special meaning.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Literal str.replace rather than re.sub: passing the marker to re.sub
    # treats it as a regex, so a "." in it would match any character and
    # delete text that merely resembles the marker.
    df["nadi_data"] = df[column_name].apply(
        lambda x: x.replace(string_to_remove, '').strip()
    )
    return df

In [None]:
def remove_start_from_content(df, column_name, r_text):
    """Delete every match of the pattern ``r_text`` from ``df[column_name]``.

    Each value is run through ``re.sub`` with an empty replacement and then
    stripped of leading/trailing whitespace. The cleaned values are stored in
    the ``"nadi_data"`` column of ``df`` (created or overwritten in place) and
    the same frame object is returned.
    """
    def _scrub(cell):
        without_match = re.sub(r_text, '', cell)
        return without_match.strip()

    cleaned = df[column_name].apply(_scrub)
    df["nadi_data"] = cleaned
    return df

In [None]:
# Two passes: the first reads text_content and writes the result to nadi_data;
# the second reads nadi_data and overwrites it with the further-cleaned text.
df = remove_string_from_content(df, "text_content", string_to_remove)
df = remove_start_from_content(df, "nadi_data", r_text)

In [None]:
# Drop text_content now that nadi_data holds the cleaned text.
df = df[["Patient Age","nadi_data","Admin Comment"]]

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.head()

In [None]:
def process_nadi_data(column):
    """Parse newline/comma separated text into nested lists of integers.

    Parameters
    ----------
    column : pandas.Series
        Series of strings. Each string is split into rows on ``"\\n"`` and each
        row into fields on ``","``.

    Returns
    -------
    pandas.Series
        One list per input value, containing one list of ints per non-blank
        row. Fields that are not made up purely of decimal digits (blank,
        signed, or otherwise non-numeric) are skipped, so a row with no usable
        field yields an empty list.
    """
    def safe_convert_to_int(row):
        # Use isdecimal(), not isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        fields = (value.strip() for value in row.split(','))
        return [int(field) for field in fields if field.isdecimal()]

    def parse_cell(cell):
        # Blank rows are dropped before parsing.
        return [safe_convert_to_int(row) for row in cell.strip().split('\n') if row.strip()]

    return column.apply(parse_cell)

# Parse the nadi_data text into nested integer lists, stored in a new column
df['processed_nadi_data'] = process_nadi_data(df['nadi_data'])

# Display the processed column (as a one-column frame)
df[['processed_nadi_data']]

In [None]:
df.head()

In [None]:
def _nadi_mean(rows):
    """Return the mean of every integer in a nested list, or NaN if there are none."""
    count = sum(len(sublist) for sublist in rows)
    if count == 0:
        # A record whose text produced no integers would otherwise raise
        # ZeroDivisionError and abort the whole cell.
        return float('nan')
    return sum(sum(sublist) for sublist in rows) / count


# Total of all integers parsed for each record (0 when nothing was parsed)
df['sum_of_nadi_data'] = df['processed_nadi_data'].apply(lambda x: sum(sum(sublist) for sublist in x))

# Mean of all integers parsed for each record (NaN when nothing was parsed)
df['avg_of_nadi_data'] = df['processed_nadi_data'].apply(_nadi_mean)

# # Example: Taking the maximum value from the processed lists for each row
# df['max_of_nadi_data'] = df['processed_nadi_data'].apply(lambda x: max(max(sublist) for sublist in x))

# Display the resulting columns
df[['sum_of_nadi_data', 'avg_of_nadi_data']]

In [None]:
df.head()

In [None]:
# Modelling table: 'Patient Age' and 'sum_of_nadi_data' followed by 'Admin Comment'
# as the last column. avg_of_nadi_data is computed above but not carried over.
new_df = df[["Patient Age","sum_of_nadi_data","Admin Comment"]]
new_df.head()

In [None]:
new_df.info()

In [None]:
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
min_max = MinMaxScaler()
std = StandardScaler()
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier

In [None]:
# Features are every column except the last one; this relies on 'Admin Comment'
# being the final column of new_df. The target is selected by name.
x = new_df.iloc[:,:-1]
y = new_df["Admin Comment"]

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
y_train

In [None]:

# Standardise only 'sum_of_nadi_data'; every other input column is passed
# through unchanged (remainder='passthrough').
trf = ColumnTransformer([
    ('trf',StandardScaler(),["sum_of_nadi_data"])
],remainder='passthrough')

In [None]:
# Chain the column preprocessing and the classifier into a single estimator.
pipe = Pipeline(steps=[
    ('step1',trf),
    # Fixed seed: a random forest is stochastic, so without random_state the
    # score and predictions change on every Restart & Run All.
    # 42 matches the seed already used for train_test_split.
    ('step2',RandomForestClassifier(random_state=42))
])

In [None]:
pipe.fit(x_train,y_train)

In [None]:
# Score of the fitted pipeline on the held-out rows, scaled to a percentage
# (for a classifier, scikit-learn's score() is mean accuracy).
pipe.score(x_test,y_test)*100

In [None]:
# One hand-written sample: [Patient Age, sum_of_nadi_data].
# The column names must match those used at fit time, because the
# ColumnTransformer selects 'sum_of_nadi_data' by name.
input_data = [[46,24500481]]
pred_df= pd.DataFrame(input_data, columns=['Patient Age' ,'sum_of_nadi_data'])

In [None]:
# The result is the encoded 'Admin Comment' code, not the original text.
# NOTE(review): presumably ord_en.inverse_transform(pred.reshape(-1, 1)) maps it
# back to the original value - confirm before relying on it.
pipe.predict(pred_df)