

```
# This is formatted as code
```

***Run the cell below once every few months to generate fresh joblib files for the XGBoost classifier and the vectorizer. It reads the data from the online database and trains a model on it. The script then saves two joblib files (`xgb_model.joblib` and `vectorizer.joblib`) in the local directory of the .py file. I will also provide this code as a .py file.***

In [None]:
import firebase_admin
from firebase_admin import firestore
from firebase_admin import credentials
import time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Load the service account key JSON file
cred = credentials.Certificate('C:/Users/advai/Desktop/clupp internship/auto-clupp-firebase-adminsdk-evw67-b6eb6f3b75.json')

# Initialize the Firebase Admin SDK with the service account credentials.
# initialize_app() raises ValueError when the default app already exists,
# which happens as soon as this cell is run a second time in the same
# session, so reuse the existing app when there is one.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Initialize Firestore client
db = firestore.client()

# Reference the 'predictions' sub-collection of the 'IT/whatsApp' document
# (note the lower-case 'w': that is the document id the code actually uses)
collection_ref = db.collection('IT').document('whatsApp').collection('predictions')

# Retrieve all documents from the 'IT/whatsApp/predictions' collection
documents = collection_ref.get()

# Filler expressions (greetings, acknowledgements) mapped to a length buffer.
# A message containing the expression is filtered out when it is shorter
# than len(expression) + buffer characters.
expressions_buffers = {
    'hola': 11,
    'buen': 20,
    'que tal': 3,
    'si': 10,
    'ok': 10,
    'acuerdo': 20,
    'gracias': 20,
    'no': 10,
    'hi': 11
}


# NOTE: the name shadows the builtin `filter`; it is kept because the rest of
# the script calls it under this name.
def filter(string):
    """Tell whether `string` is a short message built around a filler expression.

    The comparison is case-insensitive and uses plain substring matching.

    Args:
        string: the message text to inspect.

    Returns:
        True if any key of `expressions_buffers` occurs in `string` and
        `string` is shorter than that key's length plus its buffer,
        otherwise False.
    """
    lowered = string.lower()
    total_length = len(string)
    return any(
        expression.lower() in lowered
        and total_length < len(expression) + buffer
        for expression, buffer in expressions_buffers.items()
    )

# Class index <-> label lookup tables.
# They are built once, before the loop, so that `antiref` exists even when a
# document already carries an explicit 'choice'. Previously `antiref` was only
# created in the fallback branch; until that branch ran for the first time,
# every labelled document hit a NameError that the bare `except` swallowed,
# silently dropping it from the dataset.
reference = {
    0: "unknown", 1: "payment", 2: "pictures", 3: "sales", 4: "human",
    5: "cancel", 6: "tech", 7: "policy", 8: "bye", 9: "hello", 10:
        "claims"
}
antiref = {label: index for index, label in reference.items()}

# Number of documents seen per class index (counted before filtering)
distribution = [0] * len(reference)
# making dict
# NOTE: this rebinds `db`, which held the Firestore client up to this point;
# from here on `db` is the in-memory dataset keyed by document id.
db = {}
deleted_list = []
# ids of documents skipped because a field was missing or malformed
skipped_ids = []
for document in documents:
    # Convert the snapshot once instead of on every field access
    data = document.to_dict()
    if 'choice' in data:
        choice = data['choice']
    else:
        # No explicit choice stored: use the class with the highest
        # confidence in the stored prediction
        predictions = data['prediction']
        entries = predictions[0]["structValue"]["fields"]["confidences"]\
            ["listValue"]["values"]
        float_values = [entry["numberValue"] for entry in entries]
        index = float_values.index(max(float_values))
        choice = reference[index]

    try:
        distribution[antiref[choice]] += 1

        message = data['message']
        # filtering out unneeded data
        if filter(message):
            deleted_list.append(message)
            continue
        # filling out dict
        doc_id = document.id
        timestamp = data['timestamp']
        # timestamp is divided by 1000 before being handed to time.ctime
        timee = time.ctime(timestamp / 1000)

        db[doc_id] = {
            'message': message,
            'choice': choice,
            'timestamp': timee
        }

    except (KeyError, TypeError, AttributeError, ValueError,
            OverflowError, OSError):
        # Best effort: a document with an unknown label, a missing field or
        # an unusable message/timestamp is skipped rather than aborting the
        # run. Unlike a bare `except`, this lets KeyboardInterrupt and
        # programming errors such as NameError surface.
        skipped_ids.append(document.id)
        continue

print(distribution)

# Splitting the data into training and testing sets
# Stratify parameter ensures even distribution of classes in train and test sets
messages = [info['message'] for info in db.values()]
choices = [info['choice'] for info in db.values()]

X_train, X_test, y_train, y_test = train_test_split(
    messages, choices, test_size=0.30, stratify=choices, random_state=42
)

#vectorizing labels
reference = {
    "unknown": 0, "payment": 1, "pictures": 2, "sales": 3, "human": 4,
    "cancel": 5, "tech": 6, "policy": 7, "bye": 8, "hello": 9, "claims": 10
}
# Direct dict lookup instead of scanning reference.items() for every label.
# The scan silently skipped any label missing from `reference`, which would
# have left the label list shorter than (and misaligned with) the message
# list; the lookup raises KeyError in that case instead.
y_train_numeric = [reference[label] for label in y_train]
y_test_numeric = [reference[label] for label in y_test]

y_train = y_train_numeric
y_test = y_test_numeric

#vectorizing messages
# The vocabulary is learned from the training messages only; the test
# messages are transformed with that same fitted vectorizer.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

X_train = X_train_vectorized
X_test = X_test_vectorized

# Random forest baseline: fit on the vectorized training messages
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out test messages
rf_predictions = rf_classifier.predict(X_test)

# Accuracy and class-weighted F1 of the random forest on the test set
rf_f1_score = f1_score(y_test, rf_predictions, average='weighted')
rf_accuracy = accuracy_score(y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest F1 Score:", rf_f1_score)

# XGBoost model: this is the classifier that gets persisted below
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train, y_train)

# Score the held-out test messages
xgb_predictions = xgb_classifier.predict(X_test)

# Accuracy and class-weighted F1 of the XGBoost model on the test set
xgb_f1_score = f1_score(y_test, xgb_predictions, average='weighted')
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost F1 Score:", xgb_f1_score)


# Persist the trained classifier and the fitted vectorizer (relative paths,
# so both files are written to the current working directory)
for artifact, filename in (
    (xgb_classifier, 'xgb_model.joblib'),
    (vectorizer, 'vectorizer.joblib'),
):
    joblib.dump(artifact, filename)

***The two small cells below import the joblib files. They won't be necessary in the actual application, since the syntax is specific to Google Colab.***

In [None]:
import joblib
from google.colab import files

# Colab only: ask for a file upload and load the first uploaded entry as the
# XGBoost classifier.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code - only upload joblib files produced by the training cell.
uploaded = files.upload()
xgb_classifier = joblib.load(next(iter(uploaded)))

Saving xgb_model.joblib to xgb_model.joblib


In [None]:
# Colab only: ask for a second upload and load the first uploaded entry as
# the fitted vectorizer.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code - only upload joblib files produced by the training cell.
uploaded = files.upload()
vectorizer = joblib.load(next(iter(uploaded)))


Saving vectorizer.joblib to vectorizer.joblib


***The code below is the classification script for any input string.***

In [None]:
# Class label -> class index. The insertion order of this dict is what pairs
# each label with a probability column in classify_message.
reference = {
    "unknown": 0,
    "payment": 1,
    "pictures": 2,
    "sales": 3,
    "human": 4,
    "cancel": 5,
    "tech": 6,
    "policy": 7,
    "bye": 8,
    "hello": 9,
    "claims": 10,
}


def classify_message(message):
    """Rank every class label by the model's probability for `message`.

    Uses the module-level `vectorizer` and `xgb_classifier`.

    Args:
        message: the text to classify.

    Returns:
        A list of (label, probability) tuples sorted by probability,
        highest first. Labels are paired with probabilities positionally,
        in the key order of `reference`.
        NOTE(review): this assumes the classifier's probability columns
        follow the same 0..10 class order - confirm for the loaded model.
    """
    # Single-message batch -> feature row
    features = vectorizer.transform([message])
    # predict_proba returns one row per input; take the only row
    class_probabilities = xgb_classifier.predict_proba(features)[0]

    ranked = sorted(
        zip(reference.keys(), class_probabilities),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

# Interactive session: read a message, print every class with its
# probability (highest first), repeat.
while True:
    try:
        input_message = input("Enter a message: ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-C / Ctrl-D (or interrupting the notebook cell) is the only way
        # out of this loop, so end the session cleanly instead of surfacing
        # a traceback.
        print("")
        break
    result = classify_message(input_message)
    print("")
    print("XGBoost Classification Results:")
    for class_label, probability in result:
        print(class_label, ": ", probability)
    print("")

Enter a message: hola

XGBoost Classification Results:
unknown :  0.869751
sales :  0.051400114
pictures :  0.029096397
payment :  0.025286946
human :  0.009296991
policy :  0.008927708
hello :  0.0025628929
cancel :  0.0017266416
tech :  0.0016361878
claims :  0.00016196375
bye :  0.00015317524

Enter a message: hi

XGBoost Classification Results:
unknown :  0.8221779
sales :  0.049051266
payment :  0.039934248
pictures :  0.029610692
human :  0.02306418
policy :  0.019360505
hello :  0.006617678
cancel :  0.005522205
tech :  0.004377409
claims :  0.00015456244
bye :  0.00012930701

Enter a message: puta

XGBoost Classification Results:
unknown :  0.8221779
sales :  0.049051266
payment :  0.039934248
pictures :  0.029610692
human :  0.02306418
policy :  0.019360505
hello :  0.006617678
cancel :  0.005522205
tech :  0.004377409
claims :  0.00015456244
bye :  0.00012930701



KeyboardInterrupt: ignored

Sentiment Analysis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.12.2-py3-none-any.whl (373 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m373.1/373.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting segtok>=1.5.7 (from flair)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting mpld3==0.3 (from flair)
  Downloading mpld3-0.3.tar.gz (788 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m788.5/788.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sqlitedict>=1.6.0 (from flair)
  Downloading sqlitedict-2.1.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deprecated>=1.2.4 (from flair)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting boto3 (from flair)
  Downloading boto3-1.26.156-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

ImportError: ignored