In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_imdb_reviews(url, num_pages):
    """Collect review bodies from consecutive IMDb review pages.

    Args:
        url: Reviews page URL. It may already carry a query string
            (e.g. ``.../reviews?ref_=tt_urv``); the page offset is merged
            into it rather than concatenated.
        num_pages: Number of pages to request. Page ``p`` is requested with
            the query parameter ``start = p * 10``.

    Returns:
        A list of ``{'body': <review text>}`` dicts, in the order the
        reviews were encountered.

    Raises:
        requests.HTTPError: If a page responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    reviews = []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    for page in range(num_pages):
        # Pass the offset via `params` so requests merges it with any query
        # string already present in `url`. Building the URL as
        # f"{url}?start=..." produced "...?ref_=tt_urv?start=10", where
        # `start` is swallowed into the value of `ref_`.
        # NOTE(review): whether IMDb actually paginates on `start` cannot be
        # confirmed from this notebook - verify against the live site.
        response = requests.get(
            url,
            params={'start': page * 10},
            headers=headers,
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        # Fail loudly instead of silently parsing an error page into zero reviews.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for review in soup.find_all('div', class_='text show-more__control'):
            reviews.append({'body': review.text.strip()})

    return reviews
# List of movie URLs and their titles
movies = [
    {'url': 'https://www.imdb.com/title/tt2560140/reviews?ref_=tt_urv', 'title': 'Attack on Titan'},
    {'url': 'https://www.imdb.com/title/tt0944947/reviews?ref_=tt_urv', 'title': 'Game of Thrones'},
    # Add more movies as needed
]

num_pages = 5  # Adjust the number of pages to scrape

# Write to the path that the loading cell reads back ('imdb_reviews_Series.csv').
# The previous name, 'imdb_reviews.csv', is not read anywhere in this notebook,
# so a fresh Restart & Run All failed at pd.read_csv.
with open('imdb_reviews_Series.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['title', 'body']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for movie in movies:
        reviews = scrape_imdb_reviews(movie['url'], num_pages)
        for review in reviews:
            # Tag each scraped body with the title it belongs to before writing.
            review['title'] = movie['title']
            writer.writerow(review)

In [None]:
import pandas as pd

# Read the scraped reviews from disk and preview the first five rows
df = pd.read_csv('imdb_reviews_Series.csv')
print(df.iloc[:5])

             title                                               body
0  Attack on Titan  Attack On Titan is officially over. And nothin...
1  Attack on Titan  The moment you watch this audiovisual masterpi...
2  Attack on Titan  Before I started watching this show, I couldn'...
3  Attack on Titan  I'm a more frequent American TV shows watcher ...
4  Attack on Titan  Growing up in the 80's 90's during the time of...


In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK data.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource
# instead of the older 'punkt' one, so fetch both to keep this cell runnable
# on either generation of the library.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def preprocess_text(text):
    """Normalise a review body into a space-joined string of content words.

    Steps, in order: strip ``<...>`` tags, delete every character that is not
    an ASCII letter or whitespace, tokenize with ``word_tokenize``, lowercase,
    and drop English stop words.

    Args:
        text: Raw review text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left
        or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings; an empty result keeps
        # DataFrame.apply from aborting on a single missing body.
        return ''

    # Build the stop-word set once per call. The original evaluated
    # stopwords.words('english') again for every token and searched a list.
    stop_words = set(stopwords.words('english'))

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove everything except ASCII letters and whitespace (digits go too).
    # NOTE: characters are deleted, not replaced by a space, so "couldn't"
    # becomes "couldnt" and words separated only by punctuation are fused.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize and lowercase
    tokens = (token.lower() for token in word_tokenize(text))
    # Remove stopwords and join the remaining tokens into a single string
    return ' '.join(token for token in tokens if token not in stop_words)

# Clean every review body element-wise and preview the first five rows
df['cleaned_body'] = df['body'].map(preprocess_text)
print(df.iloc[:5])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


             title                                               body  \
0  Attack on Titan  Attack On Titan is officially over. And nothin...   
1  Attack on Titan  The moment you watch this audiovisual masterpi...   
2  Attack on Titan  Before I started watching this show, I couldn'...   
3  Attack on Titan  I'm a more frequent American TV shows watcher ...   
4  Attack on Titan  Growing up in the 80's 90's during the time of...   

                                        cleaned_body  
0  attack titan officially nothing ever samewhen ...  
1  moment watch audiovisual masterpiece immediate...  
2  started watching show couldnt imagine rated hi...  
3  im frequent american tv shows watcher anime wa...  
4  growing time top anime stories always getting ...  


In [None]:
from textblob import TextBlob

def analyze_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns:
        ``'Positive'`` when the polarity is above zero, ``'Neutral'`` when it
        is exactly zero, and ``'Negative'`` otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Label each cleaned review element-wise and preview the first five rows
df['sentiment'] = df['cleaned_body'].map(analyze_sentiment)
print(df.iloc[:5])

             title                                               body  \
0  Attack on Titan  Attack On Titan is officially over. And nothin...   
1  Attack on Titan  The moment you watch this audiovisual masterpi...   
2  Attack on Titan  Before I started watching this show, I couldn'...   
3  Attack on Titan  I'm a more frequent American TV shows watcher ...   
4  Attack on Titan  Growing up in the 80's 90's during the time of...   

                                        cleaned_body sentiment  
0  attack titan officially nothing ever samewhen ...  Negative  
1  moment watch audiovisual masterpiece immediate...  Positive  
2  started watching show couldnt imagine rated hi...  Positive  
3  im frequent american tv shows watcher anime wa...  Positive  
4  growing time top anime stories always getting ...  Positive  


In [None]:
# Show the title, original text and assigned label for every review
print(df.loc[:, ['title', 'body', 'sentiment']])

# Persist the labelled frame (without the index column) to a new CSV file
df.to_csv(path_or_buf='imdb_reviews_with_sentiments_Series.csv', index=False)

               title                                               body  \
0    Attack on Titan  Attack On Titan is officially over. And nothin...   
1    Attack on Titan  The moment you watch this audiovisual masterpi...   
2    Attack on Titan  Before I started watching this show, I couldn'...   
3    Attack on Titan  I'm a more frequent American TV shows watcher ...   
4    Attack on Titan  Growing up in the 80's 90's during the time of...   
..               ...                                                ...   
365  Game of Thrones  Readers who come to this review later in the s...   
366  Game of Thrones  Started off as the greatest series of all time...   
367  Game of Thrones  I was a big fan of Game of Thrones ever since ...   
368  Game of Thrones  Do not believe any of those negative reviews. ...   
369  Game of Thrones  A series like never seen before which rocked l...   

    sentiment  
0    Negative  
1    Positive  
2    Positive  
3    Positive  
4    Positive  
.. 

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the preprocessed data with sentiment analysis by TextBlob
df = pd.read_csv('imdb_reviews_with_sentiments_Series.csv')

# Remove neutral sentiments for binary classification.
# .copy() detaches the filtered rows from the loaded frame so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['sentiment'] != 'Neutral'].copy()

# Map sentiment to binary labels
df['sentiment'] = df['sentiment'].map({'Positive': 1, 'Negative': 0})

# An empty cleaned_body is read back from CSV as NaN, which TfidfVectorizer
# and TextBlob cannot process; drop such rows before splitting.
df = df.dropna(subset=['cleaned_body'])

# Split data into training and testing sets.
# NOTE(review): the split is not stratified; the recorded output shows a 5 vs 66
# class split in the test set, so consider stratify=df['sentiment'].
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_body'], df['sentiment'], test_size=0.2, random_state=42)

# Vectorize text data (fit on the training split only, then reuse for the test split)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Define models to compare
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(kernel='linear')
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_vec, y_train)
    # Predict the sentiment of the test set
    y_pred = model.predict(X_test_vec)
    # Evaluate the model.
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # number the default produces) without emitting a warning for each report.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    confusion = confusion_matrix(y_test, y_pred)

    results[model_name] = {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": confusion
    }

# Print the results for each model
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']}")
    print("Classification Report:")
    print(metrics['classification_report'])
    print("Confusion Matrix:")
    print(metrics['confusion_matrix'])
    print("\n")

# Compare with TextBlob results.
# NOTE(review): the labels in this CSV were themselves produced by TextBlob
# polarity on cleaned_body, so this score measures agreement of TextBlob with
# itself (a round-trip sanity check), not accuracy against human labels.
textblob_sentiments = X_test.apply(lambda x: 1 if TextBlob(x).sentiment.polarity > 0 else 0)
textblob_accuracy = accuracy_score(y_test, textblob_sentiments)

print("TextBlob Accuracy:", textblob_accuracy)

Model: Naive Bayes
Accuracy: 0.971830985915493
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.97      1.00      0.99        66

    accuracy                           0.97        71
   macro avg       0.99      0.80      0.87        71
weighted avg       0.97      0.97      0.97        71

Confusion Matrix:
[[ 3  2]
 [ 0 66]]


Model: Logistic Regression
Accuracy: 0.9295774647887324
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.93      1.00      0.96        66

    accuracy                           0.93        71
   macro avg       0.46      0.50      0.48        71
weighted avg       0.86      0.93      0.90        71

Confusion Matrix:
[[ 0  5]
 [ 0 66]]


Model: Support Vector Machine
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   suppor

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from textblob import TextBlob

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load the preprocessed data with sentiment analysis by TextBlob
# NOTE(review): no cell visible in this notebook writes this file - confirm
# where 'imdb_reviews_with_sentiments.csv' comes from.
df = pd.read_csv('imdb_reviews_with_sentiments.csv')

# Remove neutral sentiments for binary classification.
# .copy() detaches the filtered rows from the loaded frame so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['sentiment'] != 'Neutral'].copy()

# Map sentiment to binary labels
df['sentiment'] = df['sentiment'].map({'Positive': 1, 'Negative': 0})

# An empty cleaned_body is read back from CSV as NaN, which TfidfVectorizer
# and TextBlob cannot process; drop such rows before splitting.
df = df.dropna(subset=['cleaned_body'])

# Split data into training and testing sets.
# NOTE(review): the split is not stratified; the recorded output shows a 6 vs 40
# class split in the test set, so consider stratify=df['sentiment'].
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_body'], df['sentiment'], test_size=0.2, random_state=42)

# Vectorize text data (fit on the training split only, then reuse for the test split)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Define models to compare
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(kernel='linear')
}

# Train and evaluate each model
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(X_train_vec, y_train)
    # Predict the sentiment of the test set
    y_pred = model.predict(X_test_vec)
    # Evaluate the model.
    # zero_division=0 reports 0.0 for a class that is never predicted (the same
    # number the default produces) without emitting a warning for each report.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0)
    confusion = confusion_matrix(y_test, y_pred)

    results[model_name] = {
        "accuracy": accuracy,
        "classification_report": report,
        "confusion_matrix": confusion
    }

# Print the results for each model
for model_name, metrics in results.items():
    print(f"Model: {model_name}")
    print(f"Accuracy: {metrics['accuracy']}")
    print("Classification Report:")
    print(metrics['classification_report'])
    print("Confusion Matrix:")
    print(metrics['confusion_matrix'])
    print("\n")

# Compare with TextBlob results.
# NOTE(review): if the labels in this CSV were produced by TextBlob polarity on
# cleaned_body (as the loading comment says), this score measures agreement of
# TextBlob with itself rather than accuracy against human labels.
textblob_sentiments = X_test.apply(lambda x: 1 if TextBlob(x).sentiment.polarity > 0 else 0)
textblob_accuracy = accuracy_score(y_test, textblob_sentiments)

print("TextBlob Accuracy:", textblob_accuracy)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Naive Bayes
Accuracy: 0.8913043478260869
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.17      0.29         6
           1       0.89      1.00      0.94        40

    accuracy                           0.89        46
   macro avg       0.94      0.58      0.61        46
weighted avg       0.90      0.89      0.86        46

Confusion Matrix:
[[ 1  5]
 [ 0 40]]


Model: Logistic Regression
Accuracy: 0.8695652173913043
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.87      1.00      0.93        40

    accuracy                           0.87        46
   macro avg       0.43      0.50      0.47        46
weighted avg       0.76      0.87      0.81        46

Confusion Matrix:
[[ 0  6]
 [ 0 40]]


Model: Support Vector Machine
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   suppo