<a href="https://colab.research.google.com/github/Armaghan6623/Harmful-vs-Harmless-Text-Classification-using-NLP-Transformers/blob/main/VA_project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# STEP 1: Problem Definition
# Input: Text-based meeting transcript
# Output:
#  - Detected intent (question / decision / opinion)
#  - Extracted entities (date, time, task)
#  - Agent action (respond / defer / escalate)
#
# Phase 1 focus: NLP understanding (no avatar, no voice)


In [4]:
import pandas as pd
df=pd.read_csv("/content/meeting_dataset.csv")

In [5]:
df.head()

Unnamed: 0,meeting_id,title,date,start_time,end_time,duration_minutes,organizer,participants,meeting_type,platform,location,agenda,status
0,M001,Project Kickoff,2025-01-05,10:00,11:00,60,Ali Khan,Ali; Sara; Ahmed,Team,Zoom,Online,Project overview and roles,Completed
1,M002,AI Research Discussion,2025-01-07,14:00,15:30,90,Muhammad Armaghan,Armaghan; Usman; Bilal,Research,Google Meet,Online,PERDET framework improvements,Completed
2,M003,Client Requirement Meeting,2025-01-10,09:00,10:00,60,Sara Malik,Sara; Client A,Client,Physical,Office Room 2,Requirement gathering,Completed
3,M004,Sprint Planning,2025-01-12,11:00,12:30,90,Ahmed Raza,Ahmed; Dev Team,Agile,Microsoft Teams,Online,Sprint task allocation,Scheduled
4,M005,Internship Interview,2025-01-15,16:00,16:30,30,NADRA HR,HR; Candidate,Interview,Physical,NADRA Office,Technical interview,Scheduled


In [6]:
df.keys()

Index(['meeting_id', 'title', 'date', 'start_time', 'end_time',
       'duration_minutes', 'organizer', 'participants', 'meeting_type',
       'platform', 'location', 'agenda', 'status'],
      dtype='object')

Data Preprocessing

In [11]:
import re
import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [12]:
# oneâ€‘time downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text,
                    do_expand=True,
                    remove_digits=True,
                    remove_punct=True,
                    do_stem=False,
                    do_lemma=True,
                    remove_stop=True):
    # 1. to string and lowercase
    text = str(text).lower()

    # 2. expand contractions
    if do_expand:
        text = contractions.fix(text)

    # 3. remove words containing digits
    if remove_digits:
        text = re.sub(r'\w*\d\w*', ' ', text)

    # 4. remove punctuation
    if remove_punct:
        text = re.sub(r'[^\w\s]', ' ', text)

    # 5. tokenize
    tokens = word_tokenize(text)

    # 6. remove stop words
    if remove_stop:
        tokens = [w for w in tokens if w not in stop_words]

    # 7. stemming / lemmatization
    if do_stem:
        tokens = [stemmer.stem(w) for w in tokens]
    if do_lemma:
        tokens = [lemmatizer.lemmatize(w) for w in tokens]

    # join back to string
    return " ".join(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


TF=IDF

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply preprocessing to create the 'stemmed_text' column
df['stemmed_text'] = df['agenda'].apply(preprocess_text)

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(
  ngram_range=(1,2),      # use unigrams and bigrams
    max_features=20000,     # limit vocab size for speed
    stop_words='english'    # remove common stopwords
)

X_train = tfidf.fit_transform(train_df['stemmed_text'])
X_test  = tfidf.transform(test_df['stemmed_text'])
y_train = train_df['label']
y_test  = test_df['label']

print("TF-IDF shape:", X_train.shape)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
