<a href="https://colab.research.google.com/github/ConfusedKlutz/Basic-Sentiment-Analysis-with-NLP-basics-/blob/main/Basic_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'imdb-dataset-sentiment-analysis-in-csv-format:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F429163%2F816060%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240714%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240714T132515Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D74b43406e4962639347c7bb690c5a1c60931ca444f56685b076974366473fa7c5ed95653f731b10384adf480170e5ceb6a6c0fabd47001bfde50cb6c23f39664713d286207ce547d6911095f77b7fbf13c40236245439f494d60b1a860dee5caf8f4a1efa71bce03f45f4cc54dc9ba7556dfd63c8693d5a8f97cb621e3b523b7f9ecaf5f4eb85932e7d961f0bceeb802adf6e7e78efd8f0843c5c4267d2fb18d495b7567ca332d8d6002162d3bb29adf967e4c36acee910be7b5f51ccfe0f566541260fab013e2e2192f8f0d49cd8248eb14f545e48f425c27db101198dc19350acaa4ec80939f2305d41000f807632238cac709e4f9b4a95f29ced549860f84'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Link the Kaggle input and working directories at ../input and ../working.
# A link that already exists is left untouched.
for _target, _link_name in ((KAGGLE_INPUT_PATH, 'input'), (KAGGLE_WORKING_PATH, 'working')):
    try:
        os.symlink(_target, os.path.join("..", _link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and unpack it under
# KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so the pair still unpacks even if the
    # URL part ever contains an unencoded colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be absent or malformed; fall back
            # to 0 ("unknown") instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            size_label = content_length if content_length is not None else 'unknown'
            print(f'Downloading {directory}, {size_label} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    # 50-character progress bar, clamped in case the server
                    # sends more bytes than it announced.
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # tarfile reopens the file by name, so buffered bytes must be on
            # disk before extraction starts.
            tfile.flush()

            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break the next iteration.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# Introduction to Natural Language Processing (NLP)

## 1. What is NLP?

- **Definition:** Natural Language Processing (NLP) is a subfield of artificial intelligence that focuses on enabling computers to understand, interpret, and generate human language.
- **Simple Explanation:** NLP involves teaching computers to understand and communicate using human language, similar to how we interact with each other through speech or text.

## 2. Need for NLP

- **Understanding Human Language:** For computers to interact effectively with humans, they must be able to comprehend and respond to natural language inputs meaningfully.
- **Automating Repetitive Tasks:** NLP can handle tasks such as sorting emails, summarizing documents, or analyzing sentiment in large volumes of text, reducing manual effort and increasing efficiency.
- **Enhancing User Experience:** Technologies like virtual assistants, chatbots, and translation services use NLP to provide more intuitive and responsive interactions, improving overall user satisfaction.

## 3. Applications of NLP

1. **Machine Translation:** Converts text from one language to another, as seen in tools like Google Translate, making communication across languages easier.
2. **Sentiment Analysis:** Analyzes and interprets emotions expressed in text, such as social media comments or customer reviews, to gauge public opinion or customer satisfaction.
3. **Text Summarization:** Generates concise summaries of lengthy documents, helping users quickly grasp the main points without reading the entire text.
4. **Speech Recognition:** Transcribes spoken language into text, enabling voice-activated assistants like Siri or Alexa to understand and respond to verbal commands.
5. **Chatbots and Virtual Assistants:** Provides automated responses and support in customer service settings, simulating human-like interactions to address user queries and issues.

## 4. Basic Steps in NLP

1. **Text Preprocessing:** Involves cleaning and preparing raw text data by removing irrelevant information, correcting errors, and normalizing the text for analysis.
2. **Tokenization:** Breaks down text into smaller units, such as words or phrases, to facilitate analysis and processing.
3. **Removing Stop Words:** Eliminates common, less informative words (e.g., "the," "and") that do not contribute significant meaning to the analysis.
4. **Stemming and Lemmatization:** Reduces words to their base or root forms (e.g., "running" to "run") to standardize and simplify the text data.
5. **Vectorization:** Transforms text into numerical representations (vectors) that can be used by machine learning models for further analysis and processing.
6. **Model Building:** Involves creating and training machine learning models to perform tasks like classification, sentiment analysis, or translation based on the processed text data.
7. **Evaluation:** Measures the performance and accuracy of NLP models using metrics and validation techniques to ensure they meet the required objectives and provide reliable results.


In [None]:
!pip install spacy &> /dev/null
print("Spacy installed successfully")
import spacy
# spaCy is a library for natural language processing (NLP) in Python.
# spaCy is known for its ease of use, speed, and accuracy in processing and analyzing large amounts of text.

In [None]:
!pip install nltk &> /dev/null
print("NLTK installed successfully")
# NLTK (Natural Language Toolkit) is a library for natural language processing (NLP) in Python.


In [None]:
import pandas as pd
import numpy as np

### Removing Punctuations


**Importance of Removing Punctuation in NLP**

Removing punctuation is a crucial preprocessing step in NLP and text analysis. Here’s why:

1. **Improves Tokenization**
   - **Consistency:** Ensures words are tokenized uniformly.
   - **Normalization:** Treats punctuated variations of words (e.g., "word," "word.") as the same token.


2. **Reduces Noise**
   - **Focus on Content:** Punctuation doesn’t add significant meaning, so removing it helps focus on the actual words.
   - **Simplifies Analysis:** Makes tasks like word frequency counts and text classification more straightforward.


3. **Enhances Text Mining**
   - **Feature Extraction:** Eliminates irrelevant punctuation that can be treated as noise.
   - **Word Embeddings:** Ensures only meaningful words contribute to embeddings.


4. **Improves Readability and Preprocessing**
   - **Uniform Text:** Creates a consistent format for easier handling.
   - **Text Cleaning:** Removes inconsistent or excessive punctuation from scraped text.

This approach aids in more effective text analysis and enhances model performance.


* Manual Removal using Python

In [None]:
# All punctuation characters the manual filter should drop. A raw string keeps
# the backslash as a literal character without relying on the invalid "\,"
# escape sequence (which newer Python versions warn about). The apostrophe is
# not part of this set.
punctuations = r'''!()-[]{};:"\,<>./?@#$%^&*_~`'''

test_string = "Hello everyone !! Are you ready to dive deep into NLP (Natural Language Processing)?? "

In [None]:
# Keep every character of test_string that is not listed in `punctuations`
# and stitch the survivors back together into a single string.
test_string_with_punctuations_removed = "".join(
    symbol for symbol in test_string if symbol not in punctuations
)

In [None]:
print(test_string_with_punctuations_removed)

* Removal using REGEX

In [None]:
import re # importing REGEX Module

In [None]:
test_string = "Okay then !! Let's get into NLP (Natural Language Processing). "
# Replace every character that is neither a word character nor whitespace
# with the empty string, i.e. delete it.
test_string_with_punctuations_removed_using_Regex = re.sub(r'[^\w\s]','',test_string)

# How the pattern [^\w\s] reads:
# []  a character class: matches any single character from the set inside
# ^   as the first character of the class, negates it ("anything except ...")
# \w  a word character (letters, digits and the underscore)
# \s  a whitespace character (space, tab, newline, ...)
# Note: because "_" counts as a word character, underscores are NOT removed.

In [None]:
print(test_string_with_punctuations_removed_using_Regex)

### Tokenisation

Tokenization is a fundamental NLP step that splits text into smaller units called tokens, such as words or sentences. This process converts unstructured text into a structured format for analysis and processing.

### Types of Tokenization

- **Word Tokenization:** Splits text into individual words.
  - *Example:* "Tokenization is a key step in NLP." → `['Tokenization', 'is', 'a', 'key', 'step', 'in', 'NLP', '.']`
- **Sentence Tokenization:** Splits text into individual sentences.
  - *Example:* "Tokenization is a key step in NLP. It helps convert text into tokens." → `['Tokenization is a key step in NLP.', 'It helps convert text into tokens.']`
- **Subword Tokenization:** Splits text into subword units (e.g., Byte-Pair Encoding).
  - *Example:* "unhappiness" → `['un', 'happiness']`
- **Character Tokenization:** Splits text into individual characters.
  - *Example:* "Token" → `['T', 'o', 'k', 'e', 'n']`
- **N-gram Tokenization:** Creates tokens of size n from the text.
  - *Example (bigrams, n = 2):* "Tokenization is important." → `[('Tokenization', 'is'), ('is', 'important'), ('important', '.')]`

### Applications

- **Text Classification:** Converts text into features for classification algorithms.
- **Information Retrieval:** Aids in indexing and searching text.
- **Text Analysis:** Used in sentiment analysis, entity recognition, etc.
- **Machine Translation:** Breaks text into manageable units for translation.
- **Language Modeling:** Helps predict the next word or sequence.

### Challenges

- **Ambiguity:** Handling contractions and language ambiguities.
- **Special Characters:** Properly managing punctuation and special characters.
- **Different Languages:** Adapting tokenization rules for various languages.


Tokenization is essential for preparing text data for analysis and enables more advanced text understanding and processing.


* **With Python**

In [None]:
tokenising_python = "Lets use our Python basics to tokenise our sentence"
tokenising_python.split(" ")

* **using NLTK**

In [None]:
import nltk

# Punkt is NLTK's pre-trained sentence tokenizer; nltk.word_tokenize relies on
# it. NLTK 3.9+ loads the model from the 'punkt_tab' resource while older
# releases use 'punkt', so request both to work on either version.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource)

In [None]:
tokenisning_example = "Now, lets tokenise using a library"
nltk.word_tokenize(tokenisning_example)

# Removing Stop words

Removing stop words is a preprocessing step in NLP that involves filtering out common words that don't add significant meaning to text analysis. These include articles, prepositions, conjunctions, and frequent pronouns.

### Why Remove Stop Words?

- **Reduce Noise:**
  - **Focus on Meaningful Words:** Stop words don't contribute much semantic value, so removing them highlights more important words.
  - **Improve Accuracy:** Reduces irrelevant information, enhancing text analysis performance.


- **Simplify Analysis:**
  - **Feature Reduction:** Reduces text data dimensionality, simplifying tasks like classification and clustering.
  - **Improve Efficiency:** Fewer tokens lead to faster processing and more efficient algorithms.


- **Enhance Model Performance:**
  - **Text Classification:** Models trained on cleaner data can better focus on informative features.
  - **Search Engines:** Improves search results by concentrating on relevant keywords.

### Common Stop Words

- **Articles:** a, an, the
- **Prepositions:** in, on, at, of
- **Conjunctions:** and, or, but
- **Pronouns:** he, she, it, they
- **Others:** is, are, was, has

Removing stop words helps in focusing on the core content of the text, leading to better analysis and model performance.


In [None]:
from nltk.corpus import stopwords

# The stop-word lists are a separate NLTK corpus. Without this download,
# stopwords.words() raises LookupError on a fresh runtime.
nltk.download('stopwords')
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
text_stop_words = "I trust my preperations for evey aspect of work"
words = text_stop_words.split(" ")

In [None]:
# The NLTK English stop words are lowercase, so compare the lowercased word;
# otherwise capitalised stop words such as "I" would slip through the filter.
no_stop_words = [word for word in words if word.lower() not in stop_words]
no_stop_words

In [None]:
stop_words.append("work")

In [None]:
# Filter again now that the stop-word list has been extended. The comparison
# uses the lowercased word so capitalised stop words (e.g. "I") are removed too.
no_stop_words = [word for word in words if word.lower() not in stop_words]
no_stop_words

# Stemming

Stemming is a text normalization technique in NLP that reduces words to their root form by stripping suffixes. This process helps standardize text for further analysis.

## Why Use Stemming?

- **Reduce Variability:**
  - **Normalization:** Different forms of a word (e.g., "running," "runs") are reduced to a common base form (e.g., "run").
  - **Improved Matching:** Enhances search and retrieval by standardizing word variations.

- **Simplify Analysis:**
  - **Feature Reduction:** Reduces the dimensionality of text data, improving machine learning model performance and analysis efficiency.

- **Enhance Search and Indexing:**
  - **Consistency:** Ensures variations of a word are treated as the same word, benefiting search engines and information retrieval.

## How Stemming Works

Stemming algorithms remove prefixes and suffixes from words using predefined rules or lookup tables to derive root forms.

### Popular Stemming Algorithms

- **Porter Stemmer:**
  - **Description:** Widely used, applies a set of rules to iteratively remove suffixes.


- **Lancaster Stemmer:**
  - **Description:** More aggressive, applies more rules, often resulting in shorter stems.


- **Snowball Stemmer:**
  - **Description:** Improved version of Porter Stemmer, supports multiple languages with more sophisticated rules.

## Advantages of Stemming

- **Consistency:** Reduces variations to a common base form.
- **Efficiency:** Simplifies text, reducing unique tokens and speeding up processing.
- **Improved Matching:** Enhances ability to match words with similar meanings.

## Disadvantages of Stemming

- **Over-Stemming:** May lead to loss of meaning by reducing words to overly simplistic forms.
- **Irregular Stems:** Can produce inconsistent or confusing root forms.
- **Language Dependence:** Often language-specific and may not work well with all languages.

## Summary

Stemming normalizes words by reducing them to their base forms, aiding in text simplification and analysis. While it improves efficiency and consistency, it can also result in loss of meaning and irregularities. Understanding these aspects helps in applying stemming effectively in NLP tasks.


In [None]:
from nltk.stem import PorterStemmer

In [None]:
stem_input = "Stemming breaks words in to their base forms. The working of stemmer is predefined using various algorithms"

In [None]:
stemmer = PorterStemmer()
# Keep the tokens in their own variable: overwriting stem_input with a list
# would make this cell fail when it is run a second time, because
# word_tokenize expects a string.
stem_tokens = nltk.word_tokenize(stem_input)
for word in stem_tokens:
    print(stemmer.stem(word))

# Lemmatisation

Lemmatization is a text normalization technique in NLP that reduces words to their base or dictionary form, known as a lemma. Unlike stemming, which applies heuristic rules, lemmatization uses a linguistic approach considering the word's context and part of speech.

### Why Use Lemmatization?

- **Meaningful Base Forms:**
  - **Context-Aware:** Considers the word's context and part of speech to derive valid dictionary words.
  - **Improved Accuracy:** Preserves the semantic meaning of words.


- **Enhanced Text Analysis:**
  - **Consistency:** Reduces different inflections of a word to a common base form.
  - **Better Representation:** Creates more meaningful text representations by retaining semantic integrity.


- **Search and Retrieval:**
  - **Effective Matching:** Helps match word variations, benefiting search engines and information retrieval systems.

### How Lemmatization Works

Lemmatization algorithms use dictionaries or rules to find the base form of a word, involving:

- **Part-of-Speech Tagging:** Identifies the grammatical category of the word (e.g., noun, verb) to determine its lemma.
- **Contextual Analysis:** Considers the word's usage context for accurate base form determination.

### Popular Lemmatization Algorithms

- **WordNet Lemmatizer:**
  - **Description:** Uses WordNet lexical database for base forms based on part of speech.
  
  
- **spaCy Lemmatizer:**
  - **Description:** Part of spaCy's NLP pipeline, providing advanced lemmatization.
  
## Advantages of Lemmatization

- **Contextual Accuracy:** Maintains semantic meaning and provides accurate base forms.
- **Reduced Variability:** Converts different word forms to a standardized form.
- **Better for NLP Models:** Offers a more interpretable text representation, improving model performance.

## Disadvantages of Lemmatization

- **Computational Complexity:** More resource-intensive compared to stemming.
- **Requires POS Tagging:** Accurate lemmatization often needs part-of-speech tagging.
- **Dependency on Lexical Resources:** Effectiveness depends on the quality of lexical resources like WordNet.

## Summary

Lemmatization reduces words to their dictionary forms, considering linguistic rules and context. It improves text analysis by preserving meaning and providing a more accurate representation, though it can be more complex and resource-demanding than stemming.


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load('en_core_web_sm')

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [None]:
lemmatizer = WordNetLemmatizer()

In [None]:
lemm_input = "Stemming breaks words in to their base forms. The working of stemmer is predefined using various algorithms"

In [None]:
nltk_tokens = nltk.word_tokenize(lemm_input)
nltk_tokens_str = ' '.join(nltk_tokens)
print(nltk_tokens)

# Lemmatize with NLTK's WordNet lemmatizer (the `lemmatizer` created above).
# Called without a part-of-speech argument it treats every token as a noun,
# so verb forms such as "breaks" or "using" are only reduced when they also
# exist as nouns.
nltk_lemmatized_tokens = [lemmatizer.lemmatize(token) for token in nltk_tokens]
print("Lemmatized tokens using NLTK WordNet:", nltk_lemmatized_tokens)

# Process the string with spaCy, whose pipeline assigns a lemma to each token
doc = nlp(nltk_tokens_str)
spacy_lemmatized_tokens = [token.lemma_ for token in doc]

print("Lemmatized tokens using spaCy:", spacy_lemmatized_tokens)


# Bag of Words (BoW)

Bag of Words (BoW) is a fundamental method used in natural language processing (NLP) to convert text data into numerical features. It simplifies text by representing it as a fixed-length vector of word counts or frequencies, enabling its use in various machine learning tasks.

### How Bag of Words Works

1. **Tokenization**:
   - **Process**: Break the text into individual words or tokens. Example: "The cat sat on the mat" → `['The', 'cat', 'sat', 'on', 'the', 'mat']`.


2. **Vocabulary Creation**:
   - **Process**: Create a vocabulary of all unique tokens from the corpus. Each unique word is assigned an index.


3. **Feature Vector Creation**:
   - **Process**: Represent each document as a vector, where each dimension corresponds to a word in the vocabulary, with values indicating word counts or frequencies.

### Example

For a simple corpus with three documents:

1. "The cat sat on the mat."
2. "The dog barked at the cat."
3. "The cat chased the dog."

**Step-by-Step Process:**

1. **Tokenization**:
   - Document 1: `['The', 'cat', 'sat', 'on', 'the', 'mat']`
   - Document 2: `['The', 'dog', 'barked', 'at', 'the', 'cat']`
   - Document 3: `['The', 'cat', 'chased', 'the', 'dog']`


2. **Vocabulary Creation**:
   - Vocabulary (after lowercasing, so that "The" and "the" count as the same word): `['the', 'cat', 'sat', 'on', 'mat', 'dog', 'barked', 'at', 'chased']`


3. **Feature Vector Creation**:
   - Document 1: `[2, 1, 1, 1, 1, 0, 0, 0, 0]`
   - Document 2: `[2, 1, 0, 0, 0, 1, 1, 1, 0]`
   - Document 3: `[2, 1, 0, 0, 0, 1, 0, 0, 1]`

### Advantages of Bag of Words

1. **Simplicity**:
   - **Easy to Implement**: BoW is straightforward and a good starting point for text classification.


2. **Effectiveness**:
   - **Works Well for Many Tasks**: Effective for text classification and sentiment analysis.


3. **Feature Extraction**:
   - **Generates Numerical Data**: Converts text data into numerical features for machine learning algorithms.

### Disadvantages of Bag of Words

1. **Loss of Context**:
   - **Ignores Word Order**: Does not capture word order or context.

2. **High Dimensionality**:
   - **Large Feature Vectors**: Vocabulary size can lead to high-dimensional and sparse vectors.

3. **Vocabulary Size**:
   - **Fixed Size**: May cause data sparsity in large corpora.

4. **No Semantics**:
   - **Lacks Meaning**: Does not capture semantic relationships between words.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample document for the Bag-of-Words demo. It is built from adjacent string
# literals so that no stray backslashes (left over from collapsed line
# continuations) end up inside the text.
doc = (
    "In the-state-of-art of the NLP field, Embedding is the "
    "success way to resolve text related problem and outperform "
    "Bag of Words ( BOW ). Indeed, BoW introduced limitations "
    "large feature dimension, sparse representation etc."
)
# Learn the vocabulary of the single document and count each word.
count_vec = CountVectorizer()
count_occurs = count_vec.fit_transform([doc])

In [None]:
# Re-fit the vectorizer on the single document, then tabulate each vocabulary
# word with its count, most frequent first.
count_vec = CountVectorizer()
count_occurs = count_vec.fit_transform([doc])
vocabulary_terms = count_vec.get_feature_names_out()
term_counts = count_occurs.toarray().tolist()[0]
count_occur_df = pd.DataFrame({'Word': vocabulary_terms, 'Count': term_counts})
count_occur_df = count_occur_df.sort_values('Count', ascending=False)
print(count_occur_df.head())

In [None]:
# Toy corpus of four short sentences.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# ngram_range=(2, 2): the features are word bigrams only.
bigram_bounds = (2, 2)
vectorizer = CountVectorizer(analyzer='word', ngram_range=bigram_bounds)
x = vectorizer.fit_transform(corpus)

# Show the learned bigram vocabulary, then the per-document count matrix.
for _shown in (vectorizer.get_feature_names_out(), x.toarray()):
    print(_shown)


In [None]:
# Same toy corpus as in the bigram example.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# ngram_range=(1, 3): unigrams, bigrams and trigrams all become features.
ngram_bounds = (1, 3)
vectorizer_n = CountVectorizer(analyzer='word', ngram_range=ngram_bounds)
x = vectorizer_n.fit_transform(corpus)

# Show the learned n-gram vocabulary, then the per-document count matrix.
for _shown in (vectorizer_n.get_feature_names_out(), x.toarray()):
    print(_shown)


# TF-IDF (Term Frequency-Inverse Document Frequency)

TF-IDF is a statistical measure used in natural language processing (NLP) and information retrieval to assess the importance of a word in a document relative to a collection of documents or corpus. It combines two metrics: Term Frequency (TF) and Inverse Document Frequency (IDF).

### How TF-IDF Works

#### 1. Term Frequency (TF)
- **Definition**: Measures the frequency of a term (word) in a document.
- **Calculation**:
  $$
  \text{TF}(t, d) = \frac{\text{Number of times term } t \text{ appears in document } d}{\text{Total number of terms in document } d}
  $$

#### 2. Inverse Document Frequency (IDF)
- **Definition**: Measures the importance of a term across a collection of documents.
- **Calculation**:
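  $$
  \text{IDF}(t, D) = \log \left(\frac{N}{|\{d \in D : t \in d\}|}\right)
  $$
  - Where $N$ is the total number of documents in the corpus.
  - $|\{d \in D : t \in d\}|$ is the number of documents containing the term $t$.
  - Note: scikit-learn's `TfidfVectorizer` uses a smoothed variant by default, $\ln\left(\frac{1 + N}{1 + \text{df}(t)}\right) + 1$, and then normalizes each document vector, so its scores differ from the hand computation below.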

#### 3. TF-IDF Calculation
- **Definition**: Quantifies the relevance of a term in a document by multiplying its TF value by its IDF value.
- **Calculation**:
  $$
  \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
  $$


**Step-by-Step TF-IDF Calculation**:

1. **Tokenization**: Tokenize each document into words.
2. **TF Calculation**: Compute the term frequency for each term in each document.
3. **IDF Calculation**: Compute the inverse document frequency for each term across the corpus.
4. **TF-IDF Calculation**: Multiply TF by IDF to get the TF-IDF score for each term in each document.

### Advantages of TF-IDF

1. **Term Importance**:
   - Highlights terms that are specific to a document and less common across the corpus.

2. **Flexible Weighting**:
   - Adjusts the importance of terms based on their frequency in the document and across the corpus.

3. **Reduces Noise**:
   - Filters out common terms that appear frequently across all documents.

### Disadvantages of TF-IDF

1. **Contextual Understanding**:
   - Lacks semantic understanding and does not capture the context of language usage.

2. **Normalization**:
   - Sensitive to document length and term frequency distributions.


In [None]:
import math
# Two toy documents for the hand-rolled TF-IDF walk-through
documentA = 'The rain is pouring'
documentB = 'The rain has stopped'

# Lowercase each document and split it on whitespace to get its word list
bagOfWordsA, bagOfWordsB = (
    text.lower().split() for text in (documentA, documentB)
)

# Vocabulary: every distinct word that occurs in either document
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

# One zeroed counter per vocabulary word, kept separately for each document
numOfWordsA = {term: 0 for term in uniqueWords}
numOfWordsB = {term: 0 for term in uniqueWords}

In [None]:
# Tally how often each word occurs in its own document
for _tally, _tokens in ((numOfWordsA, bagOfWordsA), (numOfWordsB, bagOfWordsB)):
    for _token in _tokens:
        _tally[_token] += 1

In [None]:
# Function to compute Term Frequency (TF)
def computeTF(wordDict, bagOfWords):
    """Return the term frequency of every word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> number of occurrences in the document.
        bagOfWords: The document's tokens; only its length is used, as the
            total number of terms.

    Returns:
        A dict mapping each word of ``wordDict`` to ``count / len(bagOfWords)``.
        For an empty document every frequency is 0.0 instead of raising
        ZeroDivisionError.
    """
    total_terms = len(bagOfWords)
    if total_terms == 0:
        return {word: 0.0 for word in wordDict}
    return {word: count / total_terms for word, count in wordDict.items()}

In [None]:
# Compute TF for both documents
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [None]:
# Function to compute Inverse Document Frequency (IDF)
def computeIDF(docList):
    """Return ``log(N / df)`` for every word appearing as a key in any document.

    Args:
        docList: List of dicts, one per document, mapping word -> count.
            A word is considered present in a document when its count is > 0.

    Returns:
        A dict mapping each word to ``math.log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents containing
        the word. A word that occurs in no document gets 0.0 rather than
        triggering a division by zero. An empty ``docList`` yields ``{}``.
    """
    N = len(docList)

    # Collect document frequencies over the union of all documents' keys, so
    # documents with differing vocabularies do not raise KeyError.
    docFrequency = {}
    for doc in docList:
        for word, val in doc.items():
            docFrequency.setdefault(word, 0)
            if val > 0:
                docFrequency[word] += 1

    return {
        word: (math.log(N / float(df)) if df else 0.0)
        for word, df in docFrequency.items()
    }

# Compute IDF for the corpus
idfs = computeIDF([numOfWordsA, numOfWordsB])

In [None]:
# Function to compute TF-IDF
def computeTFIDF(tfDict, idfDict):
    """Multiply each term frequency by the matching inverse document frequency.

    Args:
        tfDict: Mapping of word -> term frequency for one document.
        idfDict: Mapping of word -> IDF; must contain every word of ``tfDict``.

    Returns:
        A dict mapping each word of ``tfDict`` to ``tf * idf``.
    """
    return {term: tf * idfDict[term] for term, tf in tfDict.items()}

# Compute TF-IDF for both documents
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

In [None]:
# Print the results
# print("TF-IDF for Document A:", tfidfA)
# print("TF-IDF for Document B:", tfidfB)
df = pd.DataFrame([tfidfA,tfidfB])
df

* **Using scikit-learn's `TfidfVectorizer` directly**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus of four short sentences
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the TF-IDF matrix in a single step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# The learned vocabulary terms label the matrix columns
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)

# Dense view of the matrix as a table with one row per document
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=feature_names)
print("\nTF-IDF DataFrame:\n", tfidf_df)


In [None]:
# Get the feature names
feature_names = vectorizer.get_feature_names_out()

# Print the feature names
print("Feature Names:", feature_names)

# Print the document-term matrix that belongs to these feature names.
# `x` still holds the n-gram count matrix from the earlier CountVectorizer
# cell, whose columns do not line up with the TF-IDF vocabulary, so use the
# matrix produced by this vectorizer instead.
print("Document-Term Matrix:\n", tfidf_matrix.toarray())

# Word2Vec

Word2Vec is a technique in natural language processing (NLP) for representing words as dense vectors in a continuous vector space. It captures semantic meanings and relationships between words based on their context in a corpus.

### Key Concepts

- **Word Embeddings**: Dense vector representations of words.
- **Training Objective**: Position similar words close together in the vector space based on context.

### Algorithms

1. **Continuous Bag of Words (CBOW)**:
   - **Objective**: Predict a target word from its context.
   - **Approach**: Average context word vectors to predict the target word vector.


2. **Skip-gram**:
   - **Objective**: Predict context words from a target word.
   - **Approach**: Use target word vector to predict context word vectors.

### How It Works

1. **Tokenization**: Split text into words.
2. **Training**: Train CBOW or Skip-gram on a large text corpus to learn word vectors.
3. **Embedding Retrieval**: Extract word vectors for NLP tasks.

### Advantages

- **Semantic Similarity**: Captures word relationships (e.g., "king" - "man" + "woman" ≈ "queen").
- **Dimensionality Reduction**: Produces dense vectors.
- **Contextual Understanding**: Reflects word meanings based on context.

### Disadvantages

- **Fixed Context Window Size**: May miss relevant context.
- **Computational Cost**: Training can be expensive.
- **Out of Vocabulary (OOV)**: Words not seen during training are not represented.


In [None]:
from gensim.models import Word2Vec
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

# Tokenize each document in the corpus (lowercased so "This" and "this"
# share one vocabulary entry)
tokenized_corpus = [nltk.word_tokenize(doc.lower()) for doc in corpus]

# Train Word2Vec model; min_count=1 keeps every word, even ones seen once
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

# Look up the learned vector for one word. The message is built from `word`
# so it stays correct when a different word is queried.
word = 'document'
if word in model.wv:
    vector = model.wv[word]
    print(f"Vector for '{word}':", vector)
else:
    print(f"'{word}' not found in the model vocabulary.")

# Basic Model for Sentiment Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
train_data = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv")
test_data = pd.read_csv("/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv")

In [None]:
train_data.head()

In [None]:
# Text preprocessing function
# The stop-word set and the stemmer do not depend on the input text, so they
# are built once here instead of once per review.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Turn raw review text into a list of stemmed, filtered tokens.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, keep only purely
    alphabetic tokens, drop English stop words, then apply the Porter stemmer.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example a
            missing value read from the CSV) is treated as empty.

    Returns:
        A list of stemmed tokens; empty for non-string or empty input.
    """
    if not isinstance(text, str):
        return []

    tokens = nltk.word_tokenize(text.lower())

    return [
        _STEMMER.stem(token)
        for token in tokens
        if token.isalpha() and token not in _STOP_WORDS
    ]

In [None]:
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

In [None]:
train_data.head()

In [None]:
w2v_model = Word2Vec(sentences=train_data['processed_text'], vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# Function to create document vectors
def document_vector(doc, model=None):
    """Average the word vectors of ``doc`` into one document vector.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]``) and
            ``vector_size``. Defaults to the module-level ``w2v_model``.

    Returns:
        A float64 array of length ``model.vector_size``: the mean of the
        vectors of all tokens found in ``model.wv``, or all zeros when none
        of the tokens is in the vocabulary.
    """
    if model is None:
        model = w2v_model

    known_vectors = [model.wv[word] for word in doc if word in model.wv]
    if not known_vectors:
        # Take the size from the model rather than hard-coding 100, so the
        # function keeps working if vector_size is changed at training time.
        return np.zeros(model.vector_size)
    # Accumulate in float64 regardless of the precision of the stored vectors.
    return np.mean(known_vectors, axis=0, dtype=np.float64)

# Create document vectors for train and test data
X_train = np.array(train_data['processed_text'].apply(document_vector).tolist())
X_test = np.array(test_data['processed_text'].apply(document_vector).tolist())

In [None]:
y_train = train_data['label']
y_test = test_data['label']

In [None]:
# Fit a logistic-regression classifier on the document vectors
lr_model = LogisticRegression(random_state=42)
lr_model = lr_model.fit(X_train, y_train)

# Predict the test set and report the share of correct labels
y_pred = lr_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Bar chart of how many training rows carry each label
label_counts = train_data['label'].value_counts()
plt.figure(figsize=(8, 6))
label_counts.plot(kind='bar')
plt.title('Sentiment Distribution in Training Data')
plt.xlabel('Sentiment (0: Negative, 1: Positive)')
plt.ylabel('Count')
plt.show()
