<a href="https://colab.research.google.com/github/Ali-Beg/Ali-Beg/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nltk



In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk. stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize
import warnings
warnings.filterwarnings('ignore')

nltk.download('punkt')
nltk.download('stopwords')
text = "Quick Brown little fox jumps over a littledog"
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if
word.lower() not in
stop_words]
porter = PorterStemmer()
stemmed_tokens = [porter.stem(word) for word in filtered_tokens]
fdist = FreqDist (stemmed_tokens)
print(fdist.most_common(5))
sentences = sent_tokenize(text)
print (sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[('quick', 1), ('brown', 1), ('littl', 1), ('fox', 1), ('jump', 1)]
['Quick Brown little fox jumps over a littledog']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Implement morphological parser to accept and reject given string**

In [None]:
import re
def morphological_parser(string) :
# Morphological rules
# Rule 1: String must start with a vowel
  if re.match(r'^[aeiouAEIOU]', string):
# Rule 2: String must end with a consonant
    if re.match(r'.*[bcdfghjkImnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ]$',string):
# Rule 3: String must have at Least 4 characters
      if len(string) >= 4:
        return True
      else:
        return False, "Rejected: String must have at least 4 character"
    else:
      return False, "Rejected: String must end with a consonant"
  else:
    return False, "Rejected: String must start with a vowel"
# Test the parser
input_string = input("Enter string: ")
accepted, reason = morphological_parser (input_string)
if accepted:
  print("Accepted!")
else:
  print(reason)

Enter string:  an elder is always wise perso
Rejected: String must start with a vowel


# Implement stemming and lemmatization for a corpus.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK resources (punkt for sentence tokenization, wordnet for lemmatization)
nltk.download('punkt')
nltk.download('wordnet')

# Sample corpus
corpus = [
    "The quick brown foxes are jumping over the lazy dogs",
    "I am happily studying natural language processing",
    "The weather is quite pleasant today"
]

# Tokenize the corpus (split sentences into words)
tokenized_corpus = [word_tokenize(sentence) for sentence in corpus]

# Stemming (reduce words to their base form using PorterStemmer)
porter = PorterStemmer()
stemmed_corpus = [[porter.stem(word) for word in sentence] for sentence in tokenized_corpus]

print("Stemmed Corpus:")
for sentence in stemmed_corpus:
    print(sentence)  # Print each stemmed sentence

# Lemmatization (reduce words to their dictionary form using WordNetLemmatizer)
lemmatizer = WordNetLemmatizer()
lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in tokenized_corpus]

print("Lemmatized Corpus:")
for sentence in lemmatized_corpus:
    print(sentence)  # Print each lemmatized sentence


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


Stemmed Corpus:
['the', 'quick', 'brown', 'fox', 'are', 'jump', 'over', 'the', 'lazi', 'dog']
['i', 'am', 'happili', 'studi', 'natur', 'languag', 'process']
['the', 'weather', 'is', 'quit', 'pleasant', 'today']
Lemmatized Corpus:
['The', 'quick', 'brown', 'fox', 'are', 'jumping', 'over', 'the', 'lazy', 'dog']
['I', 'am', 'happily', 'studying', 'natural', 'language', 'processing']
['The', 'weather', 'is', 'quite', 'pleasant', 'today']


# Perform and analyse POS tagging -HMM

In [None]:
import nltk
from nltk.corpus import treebank
from nltk.tag import hmm

# Download the Treebank corpus (ensure internet connection)
nltk.download('treebank')

# Load the tagged sentences
tagged_sentences = treebank.tagged_sents()

# Split into training and testing sets (80% training, 20% testing)
train_size = int(0.8 * len(tagged_sentences))
train_sents = tagged_sentences[:train_size]
test_sents = tagged_sentences[train_size:]

# Train the HMM tagger
trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train(train_sents)

# Evaluate the tagger's accuracy on the test set
accuracy = tagger.evaluate(test_sents)
print("Accuracy:", accuracy)

# New sentence for tagging
new_sentence = "The cat is sitting on the mat"

# Tokenize the sentence
tokenized_sentence = nltk.word_tokenize(new_sentence)

# Tag the tokenized sentence using the trained HMM tagger
tagged_sentence = tagger.tag(tokenized_sentence)

# Print the tagged sentence
print("Tagged Sentence:", tagged_sentence)


[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


Accuracy: 0.3647387594191327
Tagged Sentence: [('The', 'DT'), ('cat', 'NNP'), ('is', 'NNP'), ('sitting', 'NNP'), ('on', 'NNP'), ('the', 'NNP'), ('mat', 'NNP')]


# Implement the viterbi algorithm Using python or NLTK

In [None]:
import numpy as np


def viterbi(observations, transition_matrix, emission_matrix):


  num_hidden_states = transition_matrix.shape[0]
  num_observations = len(observations)

  # Initialize the Viterbi path probabilities and backpointers
  viterbi_path = np.zeros((num_observations, num_hidden_states))
  backpointers = np.zeros((num_observations, num_hidden_states), dtype=int)

  # Initialization step (t=1)
  for state in range(num_hidden_states):
    viterbi_path[0, state] = emission_matrix[state, observations[0]]

  # Forward pass (t=2 to T)
  for t in range(1, num_observations):
    for state in range(num_hidden_states):
      max_prob = float('-inf')
      for prev_state in range(num_hidden_states):
        prev_prob = viterbi_path[t - 1, prev_state]
        current_prob = prev_prob * transition_matrix[prev_state, state] * emission_matrix[state, observations[t]]
        if current_prob > max_prob:
          max_prob = current_prob
          backpointers[t, state] = prev_state
      viterbi_path[t, state] = max_prob

  # Backtracking to find the most likely sequence
  hidden_states = []
  state = np.argmax(viterbi_path[-1, :])
  for t in range(num_observations)[::-1]:
    hidden_states.append(state)
    state = backpointers[t, state]
  hidden_states.reverse()

  # Calculate the probability of the most likely sequence
  best_path_prob = viterbi_path[-1, state]

  return hidden_states, best_path_prob


# Example usage (replace with your actual HMM parameters)
transition_matrix = np.array([[0.7, 0.3], [0.4, 0.6]])
emission_matrix = np.array([[0.8, 0.2], [0.1, 0.9]])
observations = [1, 0, 1]
hidden_states, best_path_prob = viterbi(observations, transition_matrix, emission_matrix)

print("Most likely sequence of hidden states:", hidden_states)
print("Probability of the most likely sequence:", best_path_prob)


Most likely sequence of hidden states: [1, 0, 1]
Probability of the most likely sequence: 0.04032


# Lab 6 Implement a bigram model using 3 sentences in python orNLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Sample sentences
sentences = [
    "I love natural language processing",
    "The quick brown fox jumps over the lazy dog",
    "NLTK is a powerful toolkit for natural language processing tasks"
]

# Tokenize sentences into lowercase words
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Create bigrams for each sentence
sentence_bigrams = [list(nltk.bigrams(tokens)) for tokens in tokenized_sentences]

# Build bigram model (dictionary with nested dictionaries)
bigram_model = {}
for sentence_bigram in sentence_bigrams:
    for bigram in sentence_bigram:
        # First word in the bigram (key)
        first_word = bigram[0]

        # Check if the first word exists as a key in the bigram_model
        if first_word not in bigram_model:
            bigram_model[first_word] = {}  # Create an empty inner dictionary

        # Second word in the bigram (value) and its count
        second_word = bigram[1]
        count = bigram_model[first_word].get(second_word, 0)  # Get existing count or default to 0
        bigram_model[first_word][second_word] = count + 1  # Update or create new count

# Print the bigram model
for word in bigram_model:
    print(word, ":", bigram_model[word])


i : {'love': 1}
love : {'natural': 1}
natural : {'language': 2}
language : {'processing': 2}
the : {'quick': 1, 'lazy': 1}
quick : {'brown': 1}
brown : {'fox': 1}
fox : {'jumps': 1}
jumps : {'over': 1}
over : {'the': 1}
lazy : {'dog': 1}
nltk : {'is': 1}
is : {'a': 1}
a : {'powerful': 1}
powerful : {'toolkit': 1}
toolkit : {'for': 1}
for : {'natural': 1}
processing : {'tasks': 1}


# lab 7 Text classification using Naïve Bayes Classifier

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Sample data (replace with your actual dataset)
corpus = [
    ("This is a good movie", "positive"),
    ("I enjoyed this movie a lot", "positive"),
    ("This movie is fantastic", "positive"),
    ("What a waste of time", "negative"),
    ("I wouldn't recommend this movie", "negative"),
    ("Terrible acting", "negative")
]

# Split data into features (X) and labels (y)
X, y = zip(*corpus)  # Unpack tuples into separate lists of features and labels

# Vectorize the text data
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)  # Set random_state for reproducibility

# Train Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.5
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         0
    positive       1.00      0.50      0.67         2

    accuracy                           0.50         2
   macro avg       0.50      0.25      0.33         2
weighted avg       1.00      0.50      0.67         2



# Lab 8 Sentiment analysis using SVM

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Sample data (replace with your actual dataset)
corpus = [
    ("This is a good movie", "positive"),
    ("I enjoyed this movie a lot", "positive"),
    ("This movie is fantastic", "positive"),
    ("What a waste of time", "negative"),
    ("I wouldn't recommend this movie", "negative"),
    ("Terrible acting", "negative")
]

# Split data into features (X) and labels (y)
X, y = zip(*corpus)  # Unpack tuples into separate lists

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)  # Set random_state for reproducibility

# Train SVM classifier with linear kernel
classifier = SVC(kernel='linear')  # You can experiment with different kernels (e.g., 'rbf', 'poly')

# Train the classifier
classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       0.0
    positive       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



# Design of ANN for email spam classification

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download nltk data
nltk.download('stopwords')
nltk.download('wordnet')

# Sample data (replace with your actual dataset)
emails = [
    "This is a good movie",
    "I enjoyed this movie a lot",
    "This movie is fantastic",
    "What a waste of time",
    "I wouldn't recommend this movie",
    "Terrible acting"
]
labels = [0, 0, 0, 1, 1, 1]  # 0 for positive, 1 for negative

# Preprocess the text data
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = text.lower()  # Convert to lowercase
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

emails = [preprocess_text(email) for email in emails]

# Convert emails into numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emails)
y = np.array(labels)  # Convert labels to a NumPy array

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert sparse matrices to dense format for compatibility with Keras
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Define the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_dense.shape[1],)),  # Input layer with 128 neurons and ReLU activation
    Dropout(0.5),  # Dropout layer with 50% dropout rate to prevent overfitting
    Dense(64, activation='relu'),  # Hidden layer with 64 neurons and ReLU activation
    Dropout(0.5),  # Dropout layer with 50% dropout rate
    Dense(32, activation='relu'),  # Hidden layer with 32 neurons and ReLU activation
    Dropout(0.5),  # Dropout layer with 50% dropout rate
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron and sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_dense, y_train, epochs=20, batch_size=16, verbose=1)

# Evaluate the model on the testing set
y_pred = (model.predict(X_test_dense) > 0.5).astype("int32")  # Threshold predictions to 0 or 1 based on a probability cutoff of 0.5
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy: 0.0
