In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer


In [None]:
# Read the data from a CSV file
df = pd.read_excel("C:/Users/p_adi/OneDrive/Desktop/output.xlsx")
df.columns = ['word', 'label']
df['word'] = df['word'].astype(str)
df['label'] = df['label'].astype(str)
df.head()

In [None]:
import json

key_col = 'word'
val_col = 'label'

# convert the dataframe to a dictionary
ref_dict = df.set_index(key_col)[val_col].to_dict()

# Write the dictionary to a JSON file
with open('ref_words.json', 'w') as f:
    json.dump(ref_dict, f)

In [None]:
# Extract features
def extract_features(df, i):
    word = df.loc[i, 'word']
    label = df.loc[i, 'label']
    features = {"word": word, "label": label}
    if i > 0:
        word1 = df.loc[i-1, 'word']
        label1 = df.loc[i-1, 'label']
        features.update({"-1:word": word1, "-1:label": label1})
    else:
        features["BOS"] = True
    if i < len(df)-1:
        word1 = df.loc[i+1, 'word']
        label1 = df.loc[i+1, 'label']
        features.update({"+1:word": word1, "+1:label": label1})
    else:
        features["EOS"] = True
    return features

In [None]:
def my_tokenizer(X):
    newlist = []
    for alist in X:
        newlist.append(alist[0].split(' '))
    return newlist

In [None]:
# Extract features for all words in the dataframe
X = []
y = []
for i in range(len(df)):
    X.append(extract_features(df, i))
    y.append(df.loc[i, 'label'])

# Vectorize the features
vec = DictVectorizer()
X = vec.fit_transform(X)

# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create a pipeline for monogram, bigram and trigram
text_clf = Pipeline([
                ('vect', CountVectorizer(ngram_range=(1,3),lowercase=False)),
                ('clf', Perceptron()),
             ])

# Train the model
text_clf.fit(df['word'],df['label'])

# Test the model
y_pred = text_clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
import pandas as pd
import os
data = pd.read_csv("C:/Users/p_adi/OneDrive/Desktop/output_updated.csv")


# Split the data into training and testing sets
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

# Extract the words and labels for the training set
train_words = train_data["word"].tolist()
train_labels = train_data["label"].tolist()

# Extract the words and labels for the testing set
test_words = test_data["word"].tolist()
test_labels = test_data["label"].tolist()


In [None]:
train_words

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report

# Convert the words and labels to a list of feature dictionaries
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'word': word,
    }
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

# Convert the data to feature dictionaries
train_features = [sent2features(s) for s in train_words]
train_labels = [sent2labels(s) for s in train_labels]

# Vectorize the feature dictionaries
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)

# Create and train the model
clf = Perceptron(verbose=1)
clf.fit(X_train, train_labels)


In [None]:
import pandas as pd

# Read in the data
data = pd.read_csv("C:/Users/p_adi/OneDrive/Desktop/output_updated.csv")

# Split the data into training and testing sets
train_data = data.sample(frac=0.8, random_state=1)
test_data = data.drop(train_data.index)

# Extract the words and labels for the training set
train_words = train_data["word"].tolist()
train_labels = train_data["label"].tolist()

# Extract the words and labels for the testing set
test_words = test_data["word"].tolist()
test_labels = test_data["label"].tolist()

# Convert the data to a list of tuples
train_tuples = [(word, label) for word, label in zip(train_words, train_labels)]
test_tuples = [(word, label) for word, label in zip(test_words, test_labels)]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Convert the data to a list of feature dictionaries
def word2features(word):
    features = {
        'word': word,
    }
    return features

train_features = [word2features(word) for word, label in train_tuples]

# Vectorize the feature dictionaries
vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(train_features)
y_train = [label for word, label in train_tuples]

# Create and train the model
clf = Pipeline([
    ('vectorizer', DictVectorizer()),
    ('classifier', LogisticRegression())
])
clf.fit(train_features, y_train)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert the data to a list of feature dictionaries
train_features = [word for word, label in train_tuples]
y_train = [label for word, label in train_tuples]

# Vectorize the feature dictionaries
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_features)

# Create and train the model
clf = LogisticRegression(max_iter = 1000, solver = 'sag')
clf.fit(X_train, y_train)


In [None]:
test_features = [word for word, label in test_tuples]
X_test = vectorizer.transform(test_features)
y_test = [label for word, label in test_tuples]
pred_labels = clf.predict(X_test)

# Print the classification report
print(classification_report(y_test, pred_labels))


In [None]:
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict():
    data = request.get_json(force=True)
    words = data["words"]
    features = [{'word': word} for word in words]
    X_test = vectorizer.transform(features)
    y_pred = clf.predict(X_test)
    return jsonify(y_pred)

if __name__ == '__main__':
    app.run()


In [1]:
import pandas as pd
df = pd.read_excel("C:/Users/p_adi/OneDrive/Desktop/output.xlsx")
df.columns = ['word', 'label']
df['word'] = df['word'].astype(str)
df['label'] = df['label'].astype(str)
df.head()

Unnamed: 0,word,label
0,postural hypotension,disease
1,parkinson ' s disease,disease
2,systolic orthostatic hypotension,disease
3,orthostatic hypotension,disease
4,reduced the supine systolic and diastolic bloo...,disease


In [2]:
list1 = list(df['word'])

In [3]:
import random

def shuffle_letters(word):
    f1 = word[0]
    f2 = word[1]
    l1 = word[-2]
    l2 = word[-1]
    middle = list(word[1:-1])
    random.shuffle(middle)
    shuffled = f1 + f2 + ''.join(middle) + l2 + l1
    return shuffled

for i in range(len(list1)):
    list1[i] = shuffle_letters(list1[i])

print(list1)

IndexError: string index out of range

In [1]:
import pandas as pd
import os
os.chdir('C:/Users/p_adi/OneDrive/Desktop')

In [2]:
data1 = pd.read_excel("Noida_OABC.xlsx")
data2 = pd.read_excel("RESULTS_200123.xlsx")
data1.head()

Unnamed: 0,CourseName,NGO Name,Faculty Guide,EnrollmentNo,StudentName,Program,Semester,Batch
0,Human Values and Community Outreach & SW102,Self,Dr Sanjana Sharma Marwaha / Staff Code -9039,A6257421156,Mr REHAN MALIK,B.A. (H) - Political Science,3,2021-2024
1,Human Values and Community Outreach & SW102,Parbhat awakening,Dr Sanjana Sharma Marwaha / Staff Code -9039,A6257421102,Ms SMARNIKA SHARMA,B.A. (H) - Political Science,3,2021-2024
2,Human Values and Community Outreach & SW102,ROBIN HOOD ARMY- DELHI,Dr Sanjana Sharma Marwaha / Staff Code -9039,A6257421109,Ms SYEDA RAHMAT,B.A. (H) - Political Science,3,2021-2024
3,Human Values and Community Outreach & SW102,Mangil NGO,Dr Sanjana Sharma Marwaha / Staff Code -9039,A6257421043,Mr VANGA SWAMY,B.A. (H) - Political Science,3,2021-2024
4,Human Values and Community Outreach & SW102,Sulabh International Social Service Organisation,Dr Aruditya Jasrotia / Staff Code -306195,A1707220024,Mr ANSH DHAWAN,B.A. (TA),5,2020-2023


In [6]:
data2.columns

Index(['CourseName', 'NGO Name', 'Faculty Guide', 'EnrollmentNo',
       'StudentName', 'Program', 'Semester', 'Batch'],
      dtype='object')

In [7]:
data1.columns

Index(['CourseName', 'NGO Name', 'Faculty Guide', 'EnrollmentNo',
       'StudentName', 'Program', 'Semester', 'Batch'],
      dtype='object')

In [8]:
# Get a list of the ids present in the second dataset
ids_to_remove = data2["EnrollmentNo"].tolist()

# Remove the records of ids present in the second dataset from the first dataset
new_data = data1[~data1["EnrollmentNo"].isin(ids_to_remove)]
new_data.shape

(2773, 8)

In [9]:
new_data.to_csv('filtered_record.csv')

In [10]:
new_data.head()

Unnamed: 0,CourseName,NGO Name,Faculty Guide,EnrollmentNo,StudentName,Program,Semester,Batch
0,Human Values and Community Outreach & SW102,Self,Dr Sanjana Sharma Marwaha / Staff Code -9039,A6257421156,Mr REHAN MALIK,B.A. (H) - Political Science,3,2021-2024
9,Human Values and Community Outreach & SW102,Navratan Foundation,Dr Vinita Soni / Staff Code -16714,A0706121079,Mr AYUSH RANA,B.A. (H) - English,3,2021-2024
13,Human Values and Community Outreach & SW102,"Asha Grih , Believers church",Dr Imran Hussain / Staff Code -304488,A015116718100,Ms NIDHI KUMARI,Intg BA - MA (Cli Psy),9,2018-2022
14,Human Values and Community Outreach & SW102,Robin Hood Army,Dr Anna Nath Ganguly / Staff Code -300754,A6257421079,Mr THAIRUNG DAULAGUPHU,B.A. (H) - Political Science,3,2021-2024
15,Human Values and Community Outreach & SW102,Utsaah,Dr Anna Nath Ganguly / Staff Code -300754,A6257421125,Ms SNEHA BHATI,B.A. (H) - Political Science,3,2021-2024
