## Emoji expression dection using nlp models


In [61]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

nltk.download('stopwords')

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words('english')]
    return " ".join(words)

df = pd.read_csv('data.csv')

# Check for missing values and clean data
df = df.dropna()
df['text'] = df['text'].astype(str).str.strip()
df['emoji'] = df['emoji'].astype(str).str.strip()
df = df[df['emoji'] != ""]
df = df[df['text'] != ""]
df = df.reset_index(drop=True)

# Train model
vecorized = TfidfVectorizer()
x = vecorized.fit_transform(df['text'].apply(clean_text))
y = df['emoji']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=300)
model.fit(x_train, y_train)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pdhar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [62]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the model
pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, pred))
print(classification_report(y_test, pred))

# Define prediction function
def predict_emoji(text):
    text = clean_text(text)
    vector = vecorized.transform([text])
    pred = model.predict(vector)[0]
    return pred

# Test the function
print(predict_emoji("Thank you so much for your help!"))


Accuracy: 0.23333333333333334
              precision    recall  f1-score   support

          ‚ù§Ô∏è       0.25      1.00      0.40         1
           üòÇ       0.50      0.50      0.50         2
           üòÑ       0.50      0.50      0.50         2
           üòä       0.00      0.00      0.00         2
           üòå       0.00      0.00      0.00         1
           üòç       0.00      0.00      0.00         3
           üòé       0.06      1.00      0.12         1
           üòê       0.00      0.00      0.00         1
           üò°       0.50      0.50      0.50         2
           üò¢       0.00      0.00      0.00         2
           üò®       1.00      0.25      0.40         4
           üò∞       0.00      0.00      0.00         2
           üò≤       0.50      0.50      0.50         2
           üò¥       0.00      0.00      0.00         1
           ü§¢       0.00      0.00      0.00         2
           ü§£       0.00      0.00      0.00         1
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Save the model & vectorizer

In [63]:
import pickle

with open("emoji-model.pkl", "wb") as f:
    pickle.dump(model, f)

with open ("vecorized.pkl", "wb") as f:
    pickle.dump(vecorized, f)
    