In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Load data
df = pd.read_csv("cnq1.csv")
df.head()

Unnamed: 0,Questions,Modules
0,Consider five source symbols of a discrete mem...,5
1,Compare Bus and Star topology,1
2,Compare Message Switching and Circuit Switching,2
3,"Compare LAN,MAN,WAN",1
4,Draw and explain the OSI Reference Model,1


In [2]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Questions'], df['Modules'], test_size=0.2, random_state=42)
#print(X_train)
#print(X_test)
#print(y_train)
print(y_test)

9      4
184    2
120    1
207    5
148    3
215    5
182    3
86     6
178    3
175    3
68     2
15     4
200    3
25     3
30     2
45     4
60     2
176    3
101    2
165    2
79     3
18     5
167    3
152    2
96     2
82     4
193    2
172    3
221    6
104    2
118    2
75     1
16     5
198    4
201    2
117    2
195    2
114    4
111    3
144    2
141    4
213    5
143    2
223    6
142    2
163    2
Name: Modules, dtype: int64


In [3]:
# Create a pipeline for vectorizing text data and training a LinearSVC classifier
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', LinearSVC())
])
print(text_clf)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])


In [4]:
# Train the classifier on the vectorized text data
text_clf.fit(X_train, y_train)

In [5]:
# Predict the categories of the test data using the trained model
y_pred = text_clf.predict(X_test)
print(X_test)
print(y_pred)

9                                    Compare TCP and UDP
184                                             Framing 
120    What is network , what are its goals and appli...
207                                       Huffman coding
148                                           IP address
215                                                 WWW 
182                                              Class D
86                                        berkely socket
178                                                  BGP
175                                          subnetting 
68                                              Switches
15                                           TCP Timers 
200                                                   IP
25                      What is IP , explain IPV6 header
30                      Examine different types of ALOHA
45                         Explain the TCP header format
60                       Explain ALOHA and slotted ALOHA
176                            

In [6]:
# Evaluate the accuracy and other metrics of the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7608695652173914
              precision    recall  f1-score   support

           1       0.33      0.50      0.40         2
           2       0.68      0.94      0.79        18
           3       1.00      0.73      0.84        11
           4       0.88      1.00      0.93         7
           5       1.00      0.40      0.57         5
           6       0.00      0.00      0.00         3

    accuracy                           0.76        46
   macro avg       0.65      0.60      0.59        46
weighted avg       0.76      0.76      0.73        46



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
from joblib import dump
dump(text_clf, 'text_classifier.joblib')

['text_classifier.joblib']