In [1]:
# Import necessary libraries
import numpy as np 
import pandas as pd
import nltk #used for natural language processing
from sklearn.feature_extraction.text import TfidfVectorizer #used for feature extraxtion from text
from sklearn.model_selection import train_test_split #used to divide the data into training and testing
#models we will apply for classification
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#import the data 
data=pd.read_csv('immverse_ai_eval_dataset (2)1.csv')
data.head(3)

Unnamed: 0,id,sentence,voice
0,1,The chef prepares the meal.,Active
1,2,The teacher explains the lesson clearly.,Active
2,3,The gardener waters the plants every morning.,Active


In [3]:
#map the voice 0 as active and 1 as passive
data['voice']=data['voice'].map({'Active': 0, 'Passive': 1})
data['voice'].head()

0    0
1    0
2    0
3    0
4    0
Name: voice, dtype: int64

In [4]:
#import the necessary libraries for preprocessing of the text sentence
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

In [5]:
#function For preprocessing
def pre_process(text) :
    #tokenize the text received
    tokens = word_tokenize(text)
    #Remove Stopwords
    stop_words=set(stopwords.words('english'))
    filtered_tokens=[] #list which stores the tokens after the stop word removal
    for word in tokens:
        if word.lower() not in stop_words and word not in string.punctuation :
            filtered_tokens.append(word)
    #stemming of the word tokens using PorterStemmer()
    stemmer=PorterStemmer()
    stemmed_tokens=[] #list which stores the word tokens after the stemming process
    for word in filtered_tokens:
        stemmed_tokens.append(stemmer.stem(word))
    #finaly return the stemmed token as a single text or sentence  
    return (' '.join(stemmed_tokens))

In [6]:
#Divding the data into train(60%), Test (20%), and validation(20%)
#We will divde the training data and test+validation data first
#after we will split the training and validation data
from sklearn.model_selection import train_test_split
X_train,temp_data,y_train,temp_label=train_test_split(data['sentence'],data['voice'],test_size=0.4,random_state=40)
X_test,X_valid,y_test,y_valid=train_test_split(temp_data,temp_label,test_size=0.5)

In [7]:
#apply the prerocessing steps to train test and validation data
X_train=X_train.apply(pre_process)
X_test=X_test.apply(pre_process)
X_valid=X_valid.apply(pre_process)

In [8]:
#lets check the training data after the preprocessing and stop word removal
#apply the prerocessing steps to train test and validation data
X_train.head()

32          modern dress creat design
30    film shot variou locat director
14               programm code applic
15           architect draw plan hous
20                   meal prepar chef
Name: sentence, dtype: object

In [9]:
#After the data is preprocessed we can apply the feature extraction to extract the features from our corpus 
#as vectores after we can use these vectors as numarical features to train our model to classify the sentences
vectorizer=TfidfVectorizer()

In [10]:
X_train=vectorizer.fit_transform(X_train)
X_test=vectorizer.transform(X_test)
X_valid=vectorizer.transform(X_valid)

In [11]:
#lets check the score for diffrent models we have by Hyperparameter tuning
#for this we will use GridSearchCV() the GridSearchCV() will train each model on our pre processed data 
#and will return the score for each model or the model with highest accureacy(best suitale model)
#let's create a following dictionry of models and thier parameters 
model_params = {
    'svm': {
        'model': SVC(),
        'params' : {
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(random_state=42),
        'params' : {
            'n_estimators': [5,10,15,100]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(),
        'params': {
        }
    },
    'KNN' : {
        'model' :  KNeighborsClassifier(),
        'params' : {
            'n_neighbors' :[2,3,4,5,10,7]
        }
    },
    'Decision_Tree' :{
        'model' : DecisionTreeClassifier(),
    'params':{
        
    }
                     }
    
}

In [13]:
#import GridSearchCV
from sklearn.model_selection import GridSearchCV
scores=[] #list which stores the models with resective bestscore and parameters
for model_names,mp in model_params.items() :
    mod =  GridSearchCV(mp['model'], mp['params'], cv=4, return_train_score=False)
    mod.fit(X_valid, y_valid)
    scores.append({
        'model': model_names,
        'best_score': mod.best_score_,
        'best_params': mod.best_params_
    })
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Traceback (most recent call last):
  File "C:\Users\adity\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\adity\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 444, in _passthrough_scorer
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\adity\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 668, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\adity\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\neighbors\_classification.py", line 234, in predict
    neigh_ind = self.kneighbors(X, return_distance=False)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  F

Unnamed: 0,model,best_score,best_params
0,svm,0.5,{'kernel': 'rbf'}
1,random_forest,0.5,{'n_estimators': 5}
2,logistic_regression,0.5,{}
3,KNN,0.5,{'n_neighbors': 2}
4,Decision_Tree,0.5,{}


As the data is so small the validation set is also very less thus we can use the hypertuning directly on the tarining data for the best results

In [14]:
#using hypertuning on training data 
from sklearn.model_selection import GridSearchCV
scores1=[] #list which stores the models with resective bestscore and parameters
for model_names,mp in model_params.items() :
    mod =  GridSearchCV(mp['model'], mp['params'], cv=4, return_train_score=False)
    mod.fit(X_train, y_train) #directly using training data
    scores1.append({
        'model': model_names,
        'best_score': mod.best_score_,
        'best_params': mod.best_params_
    })
df1 = pd.DataFrame(scores1,columns=['model','best_score','best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,svm,0.25,{'kernel': 'rbf'}
1,random_forest,0.291667,{'n_estimators': 5}
2,logistic_regression,0.25,{}
3,KNN,0.458333,{'n_neighbors': 4}
4,Decision_Tree,0.208333,{}


As we can see in above table or dataframe the  k-nearest neighbors model giving us the maximum score of 37% and decison tree giving us minimum score of 25% so we will use KNN algoritham model 

In [22]:
#creating instace of knn model
model=KNeighborsClassifier(n_neighbors=4)
#fit the model on the traning data 
model.fit(X_train,y_train)


In [23]:
model.score(X_test,y_test)

0.5

In [24]:
#create a function for taking new input preprocess it and produce the output
def pred(text):
    #i=input("Enter your sentence : ")
    input_text=pre_process(text)
    input_text_vector=vectorizer.transform([input_text])
    a=model.predict(input_text_vector)
    if a==0 :
        return "Active Voice"
    elif a==1 :
        return "Passive Voice"
    
    

In [25]:
#prediction for the new input sentence
i=input("Enter your sentence : ")
pred(i)


Enter your sentence : The car was driven by my father.


'Passive Voice'