In [49]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report

In [51]:
# Tab-separated SMS corpus read without a header row: the first field becomes
# the label column ("status"), the second the message text ("email").
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    "E:/Downloads/sms.csv",
    sep="\t",
    names=["status", "email"],
)


In [53]:
# Preview the first rows to confirm the label and message columns parsed as expected.
df.head()

Unnamed: 0,status,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [55]:
# Total number of rows (messages) loaded.
len(df)

5572

In [57]:
# Class balance: number of messages per label.
df.status.value_counts()

status
ham     4825
spam     747
Name: count, dtype: int64

In [17]:
# LabelEncoder assigns integer codes in sorted order of the class labels,
# so "ham" becomes 0 and "spam" becomes 1.
le = LabelEncoder()
df["status"] = le.fit_transform(df["status"])

In [59]:
# Encode the target as integers: ham -> 0, spam -> 1.
# Guarded so the cell is safe to run after the LabelEncoder cell (or to re-run):
# pushing already-encoded integers through the string-keyed dict would turn
# every label into NaN and break model fitting further down.
if not pd.api.types.is_numeric_dtype(df.status):
    df.status = df.status.map({"ham": 0, "spam": 1})

In [61]:
# Confirm that the status column now holds the integer codes.
df.head()

Unnamed: 0,status,email
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [63]:
# Features are the raw message texts; the target is the encoded label column.
x = df["email"]
y = df["status"]

In [65]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## TF-IDF

In [68]:
# TF-IDF vectorizer that drops English stop words; min_df=1 keeps every term
# that appears in at least one document.
cv = TfidfVectorizer(min_df=1,stop_words="english")

In [70]:
# Learn the vocabulary and IDF weights from the training split only, and vectorize it.
x_trainCv = cv.fit_transform(x_train)

In [72]:
# Dense view of the training matrix, for inspection only.
x_trainCv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [74]:
# Vectorize the test split with the vocabulary fitted on the training split (no refit).
x_testCv = cv.transform(x_test)

In [76]:
# Dense view of the test matrix, for inspection only.
x_testCv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Naive Bayes

In [79]:
# Multinomial Naive Bayes classifier with default hyperparameters (TF-IDF features).
mnb = MultinomialNB()

In [81]:
# Train on the TF-IDF training matrix and the encoded labels.
mnb.fit(x_trainCv,y_train)

In [83]:
# Predicted labels for the held-out messages.
prediction = mnb.predict(x_testCv)

In [85]:
# Per-class precision, recall and F1 of the TF-IDF + Naive Bayes model on the test split.
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [87]:
# Confusion matrix for the TF-IDF model: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test,prediction)
print(cm)

[[966   0]
 [ 24 125]]


## Bag Of Words

In [89]:
# Bag-of-words (raw term counts) vectorizer that drops English stop words.
cv1 = CountVectorizer(stop_words="english")

In [None]:
# Learn the bag-of-words vocabulary from the training split only, then reuse it
# for the test split. The vectorizer must be fitted before transform() can be
# called; calling transform() on an unfitted CountVectorizer raises NotFittedError.
x_traincv1 = cv1.fit_transform(x_train)
x_testcv1 = cv1.transform(x_test)

In [None]:
# Dense view of the bag-of-words training matrix, for inspection only.
x_traincv1.toarray()

In [148]:
# Second Multinomial Naive Bayes classifier, this one for the bag-of-words features.
mnb1=MultinomialNB()

In [150]:
# Train on the bag-of-words training matrix and the encoded labels.
mnb1.fit(x_traincv1,y_train)

In [152]:
# Predicted labels for the held-out messages (bag-of-words model).
pred = mnb1.predict(x_testcv1)

In [154]:
# Per-class precision, recall and F1 of the bag-of-words + Naive Bayes model on the test split.
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       966
           1       0.85      0.97      0.91       149

    accuracy                           0.97      1115
   macro avg       0.92      0.97      0.95      1115
weighted avg       0.98      0.97      0.97      1115



In [156]:
# Confusion matrix for the bag-of-words model: rows are true labels, columns are predicted labels.
cm1 = confusion_matrix(y_test,pred)
print(cm1)

[[940  26]
 [  4 145]]


In [113]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN , Embedding , GRU , LSTM , Dense ,Dropout


In [136]:

# --- Turn raw messages into fixed-length integer sequences for the recurrent models ---

# The vocabulary is learned from the training messages only. The 5000 cap must
# equal the Embedding input_dim used in build_model.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)

# Every message is padded at the end (or truncated) to exactly max_length token
# ids; 10 must equal the sequence length build_model expects.
max_length = 10
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
x_train_pad = pad_sequences(x_train_seq, maxlen=max_length, padding='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_length, padding='post')

# Features and labels are handed to Keras as float32 numpy arrays.
x_train_pad = x_train_pad.astype('float32')
x_test_pad = x_test_pad.astype('float32')
y_train = np.asarray(y_train, dtype='float32')
y_test = np.asarray(y_test, dtype='float32')

# Model factory used to build one network per recurrent cell type.
def build_model(cell_type="RNN", vocab_size=5000, embedding_dim=64, max_length=10):
    """Build and compile a binary text classifier around one recurrent layer.

    Architecture: Embedding -> Dropout(0.2) -> recurrent layer (64 units) ->
    Dropout(0.2) -> Dense(32, relu) -> Dense(1, sigmoid). The model is compiled
    with the Adam optimizer, binary cross-entropy loss and the accuracy metric.

    Args:
        cell_type: Which recurrent layer to use: "RNN" (SimpleRNN with relu),
            "LSTM" (tanh) or "GRU" (tanh).
        vocab_size: Embedding ``input_dim``; should equal the tokenizer's
            ``num_words``.
        embedding_dim: Size of each embedding vector.
        max_length: Number of token ids per padded input sequence.

    Returns:
        The compiled ``Sequential`` model.

    Raises:
        ValueError: If ``cell_type`` is not one of the supported names.
    """
    # Fail fast: previously an unknown name silently produced a model with no
    # recurrent layer, whose output is per-timestep instead of per-message.
    if cell_type not in ("RNN", "LSTM", "GRU"):
        raise ValueError(
            f"Unsupported cell_type {cell_type!r}; expected 'RNN', 'LSTM' or 'GRU'."
        )

    model = Sequential()
    # An explicit Input layer replaces Embedding's deprecated `input_length`
    # argument and lets the model be built before summary() is called.
    model.add(tf.keras.Input(shape=(max_length,)))
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
    model.add(Dropout(0.2))

    if cell_type == "RNN":
        model.add(SimpleRNN(64, activation='relu'))
    elif cell_type == "LSTM":
        model.add(LSTM(64, activation='tanh'))
    else:  # "GRU" — the only remaining validated option
        model.add(GRU(64, activation='tanh'))

    model.add(Dropout(0.2))
    model.add(Dense(32, activation='relu'))
    # Single sigmoid unit: output is the predicted probability of class 1.
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Training and testing each model
# Build, summarize and fit one network per recurrent cell type on the padded
# sequences, reporting loss/accuracy on the held-out split after every epoch.
for cell in ("RNN", "LSTM", "GRU"):
    print(f"\n🔹 Training model {cell} 🔹\n")
    model = build_model(cell_type=cell)
    model.summary()
    model.fit(
        x_train_pad,
        y_train,
        epochs=10,
        batch_size=2,
        validation_data=(x_test_pad, y_test),
    )


🔹 Training model RNN 🔹





Epoch 1/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9259 - loss: 0.2035 - val_accuracy: 0.9910 - val_loss: 0.0420
Epoch 2/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9929 - loss: 0.0286 - val_accuracy: 0.9901 - val_loss: 0.0456
Epoch 3/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9968 - loss: 0.0140 - val_accuracy: 0.9812 - val_loss: 0.0703
Epoch 4/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9944 - loss: 0.0184 - val_accuracy: 0.9830 - val_loss: 0.0875
Epoch 5/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9966 - loss: 0.0084 - val_accuracy: 0.9892 - val_loss: 0.0812
Epoch 6/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9984 - loss: 0.0047 - val_accuracy: 0.9830 - val_loss: 0.1589
Epoch 7/10
[1m

Epoch 1/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 5ms/step - accuracy: 0.9353 - loss: 0.1965 - val_accuracy: 0.9857 - val_loss: 0.0473
Epoch 2/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 4ms/step - accuracy: 0.9921 - loss: 0.0307 - val_accuracy: 0.9883 - val_loss: 0.0440
Epoch 3/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9960 - loss: 0.0149 - val_accuracy: 0.9892 - val_loss: 0.0562
Epoch 4/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.9984 - loss: 0.0049 - val_accuracy: 0.9883 - val_loss: 0.0525
Epoch 5/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9993 - loss: 0.0024 - val_accuracy: 0.9883 - val_loss: 0.0814
Epoch 6/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.9999 - loss: 8.1822e-04 - val_accuracy: 0.9883 - val_loss: 0.0687
Epoch 7/

Epoch 1/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - accuracy: 0.9379 - loss: 0.1851 - val_accuracy: 0.9892 - val_loss: 0.0398
Epoch 2/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.9929 - loss: 0.0285 - val_accuracy: 0.9874 - val_loss: 0.0415
Epoch 3/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9970 - loss: 0.0130 - val_accuracy: 0.9928 - val_loss: 0.0424
Epoch 4/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.9982 - loss: 0.0066 - val_accuracy: 0.9883 - val_loss: 0.0765
Epoch 5/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 1.0000 - loss: 2.5457e-04 - val_accuracy: 0.9857 - val_loss: 0.1113
Epoch 6/10
[1m2229/2229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9999 - loss: 5.9907e-04 - val_accuracy: 0.9883 - val_loss: 0.0715
Ep

In [None]:
# NOTE(review): disabled Airflow DAG scaffold ("nlp_pipeline", scheduled with the
# cron expression "0 3 * * *"). Nothing in the visible notebook cells references it;
# consider moving it to its own module or removing it.
# from airflow import DAG
# from airflow.operators.python import PythonOperator
# from datetime import datetime, timedelta

# default_args = {
#     "owner": "boss",
#     "start_date": datetime(2025, 3, 8),
#     "retries": 1,
# }

# dag = DAG(
#     "nlp_pipeline",
#     default_args=default_args,
#     schedule_interval="0 3 * * *", 
# )

In [None]:
# NOTE(review): disabled Airflow tasks that attach to a `dag` object. The callables
# (get_twitter_data, clean_data, transform_data, send_to_api) are not defined in
# the visible notebook cells and would have to exist before this is re-enabled.
# task1 = PythonOperator(
#     task_id="get_twitter_data",
#     python_callable=get_twitter_data,
#     dag=dag,
# )

# task2 = PythonOperator(
#     task_id="clean_data",
#     python_callable=clean_data,
#     dag=dag,
# )

# task3 = PythonOperator(
#     task_id="transform_data",
#     python_callable=transform_data,
#     dag=dag,
# )

# task4 = PythonOperator(
#     task_id="send_to_api",
#     python_callable=send_to_api,
#     dag=dag,
# )