In [None]:
!pip install nltk

In [None]:
!pip install --upgrade numpy scipy


In [None]:
import nltk
nltk.download('vader_lexicon')

In [6]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.5.1 threadpoolctl-3.5.0


In [25]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Sample data to demonstrate processing
conversation = """07/06/24, 10:22 pm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
08/06/24, 8:47 am - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
19/06/24, 1:57 pm - Aditya: When did you apply??
19/06/24, 1:58 pm - +91 98802 19952: Ma'am before I gave test after that 2nd chance will be given they told know ma'am please I didn't get that mail
19/06/24, 2:03 pm - Aditya: When did you first give the test?
19/06/24, 2:11 pm - +91 98802 19952: 13th ma'am
19/06/24, 3:15 pm - Aditya: That's good to know. Thank you!
19/06/24, 3:20 pm - +91 98802 19952: No problem, happy to help.
19/06/24, 4:00 pm - Aditya: Good luck with your next attempt!
19/06/24, 4:15 pm - +91 98802 19952: Thank you!
20/06/24, 9:00 am - Aditya: I hope everything is going well.
20/06/24, 9:15 am - +91 98802 19952: Yes, everything is good so far.
20/06/24, 10:00 am - Aditya: Great! Have a good day!
20/06/24, 10:15 am - +91 98802 19952: Thanks, you too!"""

# Split the conversation into lines
lines = conversation.split('\n')

def date_time(s):
    # Adjust the pattern to match the provided date format
    pattern = r'^\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2} [apm]{2} - '
    result = re.match(pattern, s)
    return bool(result)

def get_message(line):
    pattern = r'^(\d{2}/\d{2}/\d{2}), (\d{1,2}:\d{2} [apm]{2}) - (.*?): (.*)'
    match = re.match(pattern, line)
    if match:
        date, time, author, message = match.groups()
    else:
        pattern_no_author = r'^(\d{2}/\d{2}/\d{2}), (\d{1,2}:\d{2} [apm]{2}) - (.*)'
        match_no_author = re.match(pattern_no_author, line)
        if match_no_author:
            date, time, message = match_no_author.groups()
            author = None
        else:
            date, time, author, message = None, None, None, line
    return date, time, author, message

data = []
messageBuffer = []
date, time, author = None, None, None

for line in lines:
    print(f"Processing line: {line}")
    if date_time(line):
        print(f"Date-time match: True for line: {line}")
        if len(messageBuffer) > 0:
            data.append([date, time, author, ' '.join(messageBuffer)])
        messageBuffer.clear()
        date, time, author, message = get_message(line)
        messageBuffer.append(message)
    else:
        print(f"Date-time match: False for line: {line}")
        messageBuffer.append(line)

if len(messageBuffer) > 0:
    data.append([date, time, author, ' '.join(messageBuffer)])

print("Extracted Messages:")
df = pd.DataFrame(data, columns=["Date", "Time", "Contact", "Message"])
print(df)

# Ensure there are messages to process
if df.empty:
    raise ValueError("No valid messages found in the dataset")

# Feature extraction
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['Message'])

# Define target labels (sentiment)
df['Sentiment'] = [1 if 'good' in msg.lower() else 0 for msg in df['Message']]

# Check if we have both classes
if len(df['Sentiment'].unique()) < 2:
    raise ValueError("The dataset contains only one class.")

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(X, df['Sentiment'], test_size=0.2, random_state=42, stratify=df['Sentiment'])

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


Processing line: 07/06/24, 10:22 pm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
Date-time match: True for line: 07/06/24, 10:22 pm - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
Processing line: 08/06/24, 8:47 am - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
Date-time match: True for line: 08/06/24, 8:47 am - Messages and calls are end-to-end encrypted. No one outside of this chat, not even WhatsApp, can read or listen to them. Tap to learn more.
Processing line: 19/06/24, 1:57 pm - Aditya: When did you apply??
Date-time match: True for line: 19/06/24, 1:57 pm - Aditya: When did you apply??
Processing line: 19/06/24, 1:58 pm - +91 98802 19952: Ma'am before I gave test after that 2nd chance will be give

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
