In [40]:
import string
import nltk
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('wordnet')

nltk.download('punkt')  # Download the tokenizer models if not already done


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
#read the data file from sentences.csv file ,and then preprocess the data
import pandas as pd
# read the data file without index column
data = pd.read_csv('sentences.csv')
data['my_sentence'].fillna('', inplace=True)

print(f"list size is {len(data)}")

print(data.head())




list size is 1865
                                         my_sentence    labels
0  Communication begins before your child speaks ...  learning
1  Making eye contact, gesturing, making sounds, ...  learning
2  When shown simple sign language, children ofte...  learning
3  Then, as spoken language develops, sign langua...  learning
4  Children need a way to communicate to control ...  learning


### Explore the Data
Before going forward with the data and moving on to model training, let's check the number of unique labels in our labels column.

In [51]:
# Display unique labels and their frequency
label_counts = data['labels'].value_counts()
print(label_counts)

labels
stories                 312
what is hearing loss    237
fiction books           193
identities of deaf      170
deaf                    144
hearing implantation    130
loud music              114
hard hearing            109
Improve your English    102
learning                 98
ear poisoning            96
hearing test             89
Caring                   55
learning                 16
Name: count, dtype: int64


Encode the Labels
Machine learning models require numerical inputs, so we'll need to convert these categorical labels into a numerical format. One common approach is to use label encoding.

In [52]:
from sklearn.preprocessing import LabelEncoder

# Initialize the encoder
label_encoder = LabelEncoder()

# Fit and transform the labels to numeric values
data['encoded_labels'] = label_encoder.fit_transform(data['labels'])

# Check the mapping of encoded labels
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Label mapping:", label_mapping)


Label mapping: {'Caring': 0, 'Improve your English': 1, 'deaf': 2, 'ear poisoning': 3, 'fiction books': 4, 'hard hearing': 5, 'hearing implantation': 6, 'hearing test': 7, 'identities of deaf': 8, 'learning': 9, 'learning ': 10, 'loud music': 11, 'stories': 12, 'what is hearing loss': 13}


In [164]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer
import string
import re

# Ensure necessary resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')  # If using POS tagging in other parts

# Set up stop words
stop_words = set(stopwords.words('english'))

# Initialize the tokenizer with custom abbreviations
tokenizer = PunktSentenceTokenizer()
abbreviations = {'dr', 'mr', 'mrs', 'ms', 'inc', 'e.g', 'i.e', 'vs', 'etc'}
for abbr in abbreviations:
    tokenizer._params.abbrev_types.add(abbr)

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove hashtags (by removing the # and keeping the word)
    text = re.sub(r'#(\w+)', r'\1', text)
    # Expand common abbreviations
    abbreviations_dict = {
        'dr.': 'doctor',
        'mr.': 'mister',
        'mrs.': 'mistress',
        'ms.': 'miss',
        'inc.': 'incorporated',
        'e.g.': 'for example',
        'i.e.': 'that is',
        'vs.': 'versus',
        'etc.': 'and so on'
    }
    for abbr, full in abbreviations_dict.items():
        text = text.replace(abbr, full)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits
    text = re.sub(r'\d+', '', text)
    return text

# Process each entry in the column
myColumn = data.iloc[:, 0].values  #  'data' is my DataFrame
processed_texts = []
for text in myColumn:
    preprocessed_text = preprocess_text(text)
    # Tokenize into sentences considering custom abbreviations
    tokenized_sentences = tokenizer.tokenize(preprocessed_text)
    processed_sentences = []
    for sentence in tokenized_sentences:
        # Tokenize each sentence into words and remove stop words
        words = word_tokenize(sentence)
        filtered_words = [word for word in words if word not in stop_words and word.isalpha()]  # isalpha() removes any standalone punctuation left
        processed_sentence = ' '.join(filtered_words)
        processed_sentences.append(processed_sentence)
    processed_texts.append(processed_sentences)

    # Optional: Print the original and processed text for verification
    print(f"Original text: {text}")
    print(f"Processed sentences: {processed_sentences}\n")

print("Number of entries processed:", len(processed_texts))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Original text: Communication begins before your child speaks his or her first word.
Processed sentences: ['communication begins child speaks first word']

Original text: Making eye contact, gesturing, making sounds, and pointing are some of the ways that children can tell you what they want and need.
Processed sentences: ['making eye contact gesturing making sounds pointing ways children tell want need']

Original text: When shown simple sign language, children often can com municate basic wants and needs earlier than they could using words.
Processed sentences: ['shown simple sign language children often com municate basic wants needs earlier could using words']

Original text: Then, as spoken language develops, sign language usually decreases.
Processed sentences: ['spoken language develops sign language usually decreases']

Original text: Children need a way to communicate to control their environment and to ease frustration.
Processed sentences: ['children need way communicate cont

**Key Components of the Code**
Abbreviations: The code handles abbreviations both by informing the tokenizer not to treat them as sentence boundaries and by expanding them to their full forms during preprocessing.
Case Normalization: All text is converted to lowercase to ensure uniformity.
Stop Words Removal: Stop words are removed during the word tokenization step.
Punctuation and Numbers Removal: Punctuation and numbers are removed using str.translate and regex respectively.
Hashtag Handling: Hashtags are stripped off the '#' character but the word is kept.

In [166]:
processed_texts

[['communication begins child speaks first word'],
 ['making eye contact gesturing making sounds pointing ways children tell want need'],
 ['shown simple sign language children often com municate basic wants needs earlier could using words'],
 ['spoken language develops sign language usually decreases'],
 ['children need way communicate control environment ease frustration'],
 ['need make choices power order develop independence'],
 ['children developing speech expected rate express frustration non verbally learn use appropriate words'],
 ['sign language give children opportunities ake choices control environment'],
 ['sign language discourage learning talk'],
 [''],
 ['signs always paired spoken words'],
 ['learning sign helps child make connection object label'],
 ['signs teach power language learning signs many children eager use words'],
 ['help child'],
 ['natural gestures help child communicate waving hello bye bye arms pick'],
 ['head shake yes pushing away hands dont want'],
 [

In [167]:
   from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    """Converts POS tags to a format that WordNetLemmatizer can understand."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default POS
def lemmatize_text(text):
    if not isinstance(text, str):
        raise ValueError("Input text is not a string")  # Add a check for string type
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return ' '.join(lemmatized_words)

# If `data` is a list of lists, flatten it first or ensure it's processed correctly
if any(isinstance(el, list) for el in processed_texts):
    # Flatten the list if necessary
    processed_texts = [' '.join(sublist) if isinstance(sublist, list) else sublist for sublist in processed_texts]

lemmatized_texts = [lemmatize_text(text) for text in processed_texts]


# After defining and using the lemmatization function, check the output:
print("Sample lemmatized texts:")
for sample in lemmatized_texts[:5]:
    print(sample)  # Should print strings, not lists


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sample lemmatized texts:
communication begin child speaks first word
make eye contact gesture make sound point way child tell want need
show simple sign language child often com municate basic want need earlier could use word
spoken language develop sign language usually decrease
child need way communicate control environment ease frustration


In [168]:
print(len(lemmatized_texts))
print(type(lemmatized_texts)
)


1865
<class 'list'>



## Feature Extraction :
For feature extraction I will choose three distinct methods of text representation—TF-IDF, GloVe, and GPT which can cover a broad spectrum of natural language processing capabilities. Here’s a brief introduction to why these three methods are particularly suitable:

1. TF-IDF (Term Frequency-Inverse Document Frequency)
TF-IDF is a statistical measure used to evaluate the importance of a word to a document in a collection or corpus. It provides a weight to each word which signifies the relevance of the word for a particular document. The rationale for using TF-IDF for text analysis is its effectiveness in filtering out common words that are frequently used across all documents but hold little meaning in distinguishing a document in a given corpus. 

2. GloVe (Global Vectors for Word Representation)
GloVe is an unsupervised learning algorithm for generating vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space. Choosing GloVe is beneficial for text analysis because it captures both the semantic and syntactic aspects of words. This can enhance performance in tasks like thematic similarity detection, where context and the relational meaning of words are crucial.

3. GPT (Generative Pre-trained Transformer)
GPT, developed by OpenAI, is a state-of-the-art language model that uses the transformer architecture. It is pre-trained on a large corpus and fine-tuned for specific tasks. For chatbot Q/A, GPT can be particularly useful due to its ability to understand and generate human-like text. This capability makes it ideal for tasks that require a deep understanding of language, such as generating contextual responses or enhancing language-based data augmentation strategies for other learning models in social media analysis.

In [170]:
from nltk.tokenize import sent_tokenize

# Assume lemmatized_text is your long single string
sentences = sent_tokenize(lemmatized_texts)  # This splits the text into sentences

# Now verify the new structure
print(type(lemmatized_texts))  # Should be <class 'list'>
print(len(lemmatized_texts))   # Should reflect the number of sentences
print(type(lemmatized_texts[0]))  # Each item should be <class 'str'>

TypeError: expected string or bytes-like object, got 'list'

<class 'list'>: This shows that sentences (derived from the lemmatized_text) is a list, which is the correct format for processing multiple documents or sentences in TfidfVectorizer.

1969: This number indicates that the large text string has been successfully segmented into 1,969 separate sentences. Each of these sentences will be treated as an individual document by the TF-IDF model.

<class 'str'>: Each element in the list is a string, which is the required format for each document (or sentence in this case) when using TfidfVectorizer.

#### 1. TF-IDF with scikit-learn
First, let's create a TF-IDF vector representation of text data. This representation will help identify the relevance of words in our text relative to the corpus.

In [171]:
print(lemmatized_texts)

['communication begin child speaks first word', 'make eye contact gesture make sound point way child tell want need', 'show simple sign language child often com municate basic want need earlier could use word', 'spoken language develop sign language usually decrease', 'child need way communicate control environment ease frustration', 'need make choice power order develop independence', 'child develop speech expect rate express frustration non verbally learn use appropriate word', 'sign language give child opportunity ake choice control environment', 'sign language discourage learning talk', '', 'sign always pair spoken word', 'learn sign help child make connection object label', 'sign teach power language learn sign many child eager use word', 'help child', 'natural gesture help child communicate wave hello bye bye arm pick', 'head shake yes push away hand dont want', 'clap hand let play yay', 'kiss blowing kiss love', 'point desire toy give choice face expression convey emotion happy 

In [172]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
# Assuming 'lemmatized_texts' is a list of lemmatized sentences/documents
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmatized_texts)
print("Shape of the TF-IDF Matrix:", tfidf_matrix.shape)  # (n_documents, n_features)

# Optionally, view some of the feature names (words) to ensure it looks correct
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Some feature names:", feature_names[:10])  # Print first 10 features


Shape of the TF-IDF Matrix: (1865, 5225)
Some feature names: ['aa' 'aadpubmed' 'ababa' 'abbreviation' 'ability' 'able' 'aboo' 'abr'
 'absolutamente' 'absolutely']


**Shape of the TF-IDF Matrix: (1865, 5225)**
Document Count (1865): This figure shows that my dataset contains 1865 documents (or sentences), which have been successfully processed and vectorized. This is consistent with the number of entries you aimed to process, indicating that each text entry has been accounted for in the TF-IDF transformation.

Feature Count (5225): The feature count of 5225 suggests that after all the preprocessing and tokenization, 5225 unique terms were identified as significant across my corpus. This is a relatively large number of features, indicating a diverse vocabulary:

Positive Aspect: A higher number of features can be beneficial if these features are relevant and contribute to distinguishing between document topics or classifications effectively.
Potential Concern: A large feature set may also include noise—terms that don’t contribute much to the analysis or could even detract from model performance, particularly if many of these features are very infrequent.
Sample Feature Names
The sample feature names like ['aa', 'aadpubmed', 'ababa', 'abbreviation', 'ability', 'able', 'aboo', 'abr', 'absolutamente', 'absolutely'] provide further context:

Variety in Vocabulary: The presence of diverse terms from different contexts ('absolutamente', 'ability', 'abbreviation') suggests that my text covers a range of topics or includes multiple languages. This could be useful or could require further cleaning depending on my specific analysis goals.
Possible Noise: Terms like 'aa', 'aboo', and 'abr' may be less informative and could be considered noise, depending on their relevance to the dataset’s context.

#### 2. Using GloVe Embeddings
To use GloVe embeddings, we need to download the GloVe pre-trained word vectors and load them. we can then average the word vectors for each sentence to get a fixed-size vector representation for each sentence.  

In [173]:
import numpy as np

# Load GloVe Word Vectors
def load_glove_model(file_path):
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            glove_model[word] = embedding
    return glove_model

# Assuming GloVe vectors are in 'glove.6B.50d.txt' which is a 50-dimensional GloVe model
glove_vectors = load_glove_model('glove.6B.50d.txt')

# Function to create sentence vectors
def sentence_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros(50)  # Assuming 50-dimensional embeddings
    return np.mean(word_vectors, axis=0)

# Apply to all sentences
glove_sentence_vectors = np.array([sentence_vector(sentence, glove_vectors) for sentence in lemmatized_texts])
print(glove_sentence_vectors.shape)
print(glove_sentence_vectors)

(1865, 50)
[[ 0.07335833  0.268995   -0.27888533 ...  0.17042    -0.07053583
   0.19507833]
 [ 0.32388833  0.17898283 -0.036027   ...  0.06833217 -0.03334708
   0.32959278]
 [ 0.2081965   0.169008   -0.04332964 ...  0.32339279  0.00882557
   0.150313  ]
 ...
 [ 0.114345    0.16667267 -0.3241     ...  0.20914     0.05148033
  -0.18147133]
 [ 0.464405   -0.18016225 -0.0072225  ...  0.4716525  -0.11620125
   0.249106  ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]


**Shape of GloVe Vectors: (1865, 50)**
1865 Vectors: This corresponds to the 1865 documents or sentences in the dataset. Each sentence has been represented as a vector, which is great as it confirms that each piece of text has been processed into a numerical form.

50 Dimensions: Each vector has 50 dimensions, which is typical for a pre-trained GloVe model like the glove.6B.50d you might be using. This dimensional size offers a balance between capturing semantic details and maintaining computational efficiency.

**Contents of the GloVe Vectors**
The actual values in the vectors (0.07335833, 0.268995, -0.27888533, ..., 0.19507833) represent the semantic encoding of each sentence. These values are derived by averaging the embeddings of all words in each sentence, which captures the overall semantic meaning.

Non-zero Values: Most of the numbers being non-zero and varying significantly indicates that the embeddings are indeed capturing varied semantic information across the dataset.

Zeros in the Last Vector: The presence of a vector filled with zeros (the last one in the printout) suggests that the corresponding sentence might be empty, contain only stop words, or only include words that were not found in the GloVe vocabulary. This needs attention:



#### 3. GPT Embeddings with Hugging Face transformers
Using a pre-trained GPT model to obtain contextual embeddings involves leveraging the transformers library.

In [177]:
from transformers import GPT2Model, GPT2Tokenizer
import torch
import numpy as np

# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Since GPT-2 doesn't have a dedicated pad token, we can use the eos token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Load pre-trained model
model = GPT2Model.from_pretrained('gpt2')

# Ensure the model is in evaluation mode
model.eval()

# Function to get embeddings
def get_gpt_embeddings(text):
    # Encode text to tensor format with padding and truncation
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Get model outputs. No gradient is needed for inference, so we use torch.no_grad()
    with torch.no_grad():  # Deactivates autograd, reducing memory usage and speeding up computations
        outputs = model(**inputs)
    # Return the mean of the last hidden state as the sentence embedding
    return outputs.last_hidden_state.mean(dim=1).squeeze()

# Assuming 'lemmatized_texts' is a list of lemmatized sentences
# Ensure each sentence is a string and not empty
lemmatized_texts = [sentence for sentence in lemmatized_texts if isinstance(sentence, str) and sentence.strip()]

# Getting embeddings for each sentence and ensuring dimension reduction
gpt_embeddings = np.array([get_gpt_embeddings(sentence).detach().numpy() for sentence in lemmatized_texts])
print("Shape of GPT embeddings:", gpt_embeddings.shape)


Shape of GPT embeddings: (1790, 768)


- 1969: This number represents the total sentences you have processed, with each sentence now corresponding to a single embedding vector.
- 768: Each embedding vector has 768 features. This dimensionality comes from the hidden state size of the GPT-2 model you used. Each feature in these vectors represents a dimension of the information captured about the sentence from the contextual relationship learned by the GPT model.

## Model Selection and Training


### Model Building<br>
#### - Choose a Machine Learning Algorithm:<br>
Since we have high-dimensional embeddings, an SVM can be particularly effective. SVMs are good at finding hyperplanes in high-dimensional space to separate different classes.


#### - Split the Dataset