<a href="https://colab.research.google.com/github/FredLongo/ColabTest/blob/main/Sentiment_Analysis_Guide.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Dataset Presentation
# Data Collecting: Webscraping using 'requests' and 'BeautifulSoup'

import requests
from bs4 import BeautifulSoup

# URL of the website you want to scrape
url = 'https://example.com/reviews'

# Start from an empty list so `reviews` is always bound, even when the request
# fails; the data-cleaning cell concatenates it with the tweet texts.
reviews = []

# Fetch the content of the webpage. The timeout stops the cell from hanging on
# an unresponsive server, and connection-level errors are reported instead of
# aborting the notebook.
try:
    response = requests.get(url, timeout=10)
except requests.RequestException as error:
    response = None
    print(f"Request error: {error}")

# Check if the request was successful
if response is not None and response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Suppose reviews are in paragraphs with class 'review-text'
    reviews = [p.text for p in soup.find_all('p', class_='review-text')]
else:
    print("Failed to retrieve the webpage.")


Failed to retrieve the webpage.


In [None]:
# Social Media Data Collection
# Using Tweepy for Twitter API:
import tweepy

import os

# Your Twitter API credentials.
# Each value is read from an environment variable when one is set, so real
# secrets never have to be saved in the notebook; the placeholder strings are
# only a fallback for this demonstration.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'YOUR_CONSUMER_KEY')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'YOUR_CONSUMER_SECRET')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'YOUR_ACCESS_TOKEN')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'YOUR_ACCESS_TOKEN_SECRET')

# Setting up the tweepy authorization
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# Creating the API object
api = tweepy.API(auth)

# Searching for tweets related to a keyword (e.g., "machine learning").
# Tweepy 4 renamed API.search to API.search_tweets; use whichever method the
# installed version provides.
search_tweets = getattr(api, 'search_tweets', None) or api.search
tweets = search_tweets(q="machine learning", count=100)

# Extracting text from tweets
tweet_texts = [tweet.text for tweet in tweets]


In [None]:
# Data Cleaning
# Loading the collected data into a pandas DataFrame:

import pandas as pd

# `reviews` and `tweet_texts` are the two lists produced by the collection
# cells; they are merged into a single list purely for demonstration.
all_texts = [*reviews, *tweet_texts]

# One row per collected text, stored in a single 'text' column.
df = pd.DataFrame(data=all_texts, columns=['text'])

In [None]:
# Handle missing values:

# Discard every row whose 'text' entry is missing; df is modified in place.
df.dropna(axis='index', how='any', subset=['text'], inplace=True)

In [None]:
# Remove duplicates:

# Keep the first occurrence of each distinct 'text' value; df is modified in place.
df.drop_duplicates(subset=['text'], keep='first', inplace=True)

In [None]:
# Normalize the text data:

import re
import string

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one raw text string.

    Steps, in order: lowercase, delete URLs (anything starting with ``http``
    up to the next whitespace), delete ASCII punctuation, delete digit runs,
    then collapse every run of whitespace to a single space and trim both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if nothing but URLs, punctuation, digits
        or whitespace was present.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces: split() with no arguments discards leading,
    # trailing and repeated whitespace, including the gaps left behind by the
    # removals above, which a plain strip() would keep.
    return ' '.join(text.split())

# Run clean_text over every entry of 'text' and store the result next to it.
df['cleaned_text'] = df['text'].map(clean_text)


In [None]:
# Tokenize the text and remove stop words:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk

# The stop-word list and the tokenizer models are separate NLTK downloads.
# Probe both and fetch them when missing, so the cell also runs on a fresh
# kernel instead of stopping with LookupError. 'punkt_tab' is requested as
# well because newer NLTK releases look it up in place of 'punkt'.
try:
    stopwords.words('english')
    word_tokenize('resource check')
except LookupError:
    for resource in ('stopwords', 'punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

# Set membership keeps the per-word stop-word test cheap.
stop_words = set(stopwords.words('english'))

def tokenize_and_remove_stopwords(text):
    """Tokenize `text` and return the tokens that are not in `stop_words`.

    The surviving tokens are joined back into one string separated by single
    spaces, in their original order.
    """
    kept_tokens = filter(lambda token: token not in stop_words, word_tokenize(text))
    return " ".join(kept_tokens)

# Stop-word-free version of each cleaned text, kept in its own column.
df['processed_text'] = df['cleaned_text'].map(tokenize_and_remove_stopwords)


In [None]:
# Sentiment Analysis
# NLTK for tokenization, stemming, and lemmatization

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# The lemmatizer needs the WordNet corpus, which is a separate NLTK download.
# Probe it once and fetch the data when missing so the cell also runs on a
# fresh kernel. 'omw-1.4' is requested too because some NLTK releases require
# it alongside 'wordnet'.
try:
    lemmatizer.lemmatize('check')
except LookupError:
    for resource in ('wordnet', 'omw-1.4'):
        nltk.download(resource, quiet=True)

def stem_and_lemmatize(text):
    """Tokenize `text`, stem each token, then lemmatize each stem.

    Returns the resulting tokens joined by single spaces.
    """
    return ' '.join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in nltk.word_tokenize(text)
    )

# Stemmed and lemmatized form of each stop-word-free text.
df['lemmatized_text'] = df['processed_text'].map(stem_and_lemmatize)

In [None]:
# TextBlob
# Returns polarity and subjectivity scores

from textblob import TextBlob

def get_sentiment(text):
    """Return the `sentiment` attribute of a TextBlob built from `text`."""
    return TextBlob(text).sentiment

# Score the original text rather than 'lemmatized_text'. That column has had
# stop words removed (NLTK's English list includes negations such as "not")
# and has been stemmed, which drops negation cues and turns many words into
# stems a sentiment lexicon cannot match.
sentiment_scores = df['text'].apply(get_sentiment)

# Expand each (polarity, subjectivity) pair into two columns aligned on df's
# index. Naming the columns explicitly keeps the assignment valid even when df
# has no rows, and no temporary 'sentiment' column has to be added and dropped.
df[['polarity', 'subjectivity']] = pd.DataFrame(
    sentiment_scores.tolist(),
    index=df.index,
    columns=['polarity', 'subjectivity'],
)


In [None]:
# VADER
# Designed for sentiment analysis, especially for social media texts

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# The analyzer loads the 'vader_lexicon' resource when it is constructed. If
# the resource has not been downloaded yet, fetch it once and retry, so the
# cell also runs on a fresh kernel instead of stopping with LookupError.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    import nltk

    nltk.download('vader_lexicon', quiet=True)
    sia = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    """Return the 'compound' entry of the VADER polarity scores for `text`."""
    return sia.polarity_scores(text)['compound']

# Score the original text rather than 'lemmatized_text'. VADER is a
# lexicon-and-rule based scorer, and the preprocessed column has lost
# punctuation, negating stop words such as "not", and the unstemmed word forms
# those rules and the lexicon rely on.
df['vader_score'] = df['text'].apply(get_vader_sentiment)

In [None]:
# Fake News Detection
# Data Preparation
# '1' indicates fake news and '0' indicates genuine news

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Splitting the data into training and test sets.
# The earlier cells only create text-derived columns, so the ground-truth
# 'label' column (1 = fake, 0 = genuine) has to be added to df separately.
# Fail early with an actionable message instead of a bare KeyError.
if 'label' not in df.columns:
    raise KeyError(
        "df has no 'label' column; add the 0/1 news labels to df before "
        "running the fake-news cells."
    )

X = df['lemmatized_text']
y = df['label']

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizing the text data.
# max_df=0.7 ignores terms that appear in more than 70% of the training texts.
# The vectorizer is fitted on the training split only and then reused
# unchanged on the test split.
tfidf_vectorizer = TfidfVectorizer(max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Training a Classifier using Scikit-Learn
# Example using a simple Logistic Regression classifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a default logistic-regression classifier on the TF-IDF training matrix;
# fit() returns the estimator itself, so creation and training are chained.
clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_tfidf)

# Evaluate the model: overall accuracy first, then the per-class report.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Deep Learning Approach using TensorFlow

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Seed TensorFlow so weight initialisation and dropout masks are repeatable
# from run to run (same value as the random_state used for the data split).
tf.random.set_seed(42)

# Building a simple neural network model.
# The input width is the number of TF-IDF features. It is declared with an
# explicit Input object because Keras 3 warns when input_shape is passed to a
# layer inside a Sequential model.
model = Sequential([
    tf.keras.Input(shape=(X_train_tfidf.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # Single sigmoid unit: output is a probability for the binary label.
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Training the model.
# The sparse TF-IDF matrix is converted to a dense array before being passed
# to Keras; 20% of the training rows are held out for validation.
history = model.fit(X_train_tfidf.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate on test data
loss, accuracy = model.evaluate(X_test_tfidf.toarray(), y_test)
print("Test Accuracy:", accuracy)


In [None]:
# Results and Insights
# Analyzing results using Pandas

# Summary statistics for the three sentiment score columns computed earlier.
score_columns = ['polarity', 'subjectivity', 'vader_score']
df[score_columns].describe()

In [None]:
# Distribution of Sentiments
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white grid background for the plots.
sns.set_style("whitegrid")

# Histogram of the polarity scores with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['polarity'], kde=True, ax=ax)
ax.set(
    title='Distribution of Polarity Scores',
    xlabel='Polarity Score',
    ylabel='Count',
)
plt.show()

In [None]:
# Distribution of VADER Scores

# Histogram of the VADER compound scores with a KDE curve overlaid, in green.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['vader_score'], kde=True, color='green', ax=ax)
ax.set(
    title='Distribution of VADER Compound Scores',
    xlabel='VADER Compound Score',
    ylabel='Count',
)
plt.show()

In [None]:
# Distribution of Fake vs Genuine News

# Number of rows per value of the 'label' column.
# NOTE(review): requires df to contain a 'label' column — confirm it is added before this cell.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='label', data=df, ax=ax)
ax.set(
    title='Distribution of Fake vs Genuine News',
    xlabel='Label (0: Genuine, 1: Fake)',
    ylabel='Count',
)
plt.show()


In [None]:
# Relationship between Sentiment and News Authenticity

# Box plot of polarity scores, one box per value of the 'label' column.
# NOTE(review): requires df to contain a 'label' column — confirm it is added before this cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='label', y='polarity', data=df, ax=ax)
ax.set(
    title='Polarity Distribution by News Authenticity',
    xlabel='Label (0: Genuine, 1: Fake)',
    ylabel='Polarity Score',
)
plt.show()

In [None]:
# Trends in Sentiments Over Time:

import matplotlib.pyplot as plt
import seaborn as sns

# Convert date column to datetime type (if it's not already).
# NOTE(review): df must already contain a 'date' column; no earlier cell in
# this notebook creates one — confirm where it is added.
df['date'] = pd.to_datetime(df['date'])

# Group by date and average the numeric columns only. df also holds string
# columns (the raw and processed texts), and pandas 2.0+ raises TypeError when
# mean() meets them unless numeric_only=True is passed.
df_date = df.groupby('date').mean(numeric_only=True).reset_index()

# Plotting the trend
plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='polarity', data=df_date)
plt.title('Trend of Average Polarity Over Time')
plt.xlabel('Date')
plt.ylabel('Average Polarity')
plt.tight_layout()
plt.show()

In [None]:
# Future Predictions Visualization

# Hypothetical future predictions
future_polarity = [0.1, 0.15, 0.13, 0.2]  # These are just hypothetical values

# One month-end date per predicted value, all strictly after the last observed
# date: adding a MonthEnd offset moves a mid-month date to the end of its
# month and a month-end date to the end of the following month, so the first
# prediction never lands on the last actual observation. The offset object is
# used for `freq` rather than the 'M' alias, which newer pandas deprecates.
month_end = pd.offsets.MonthEnd()
last_observed_date = df['date'].max()
future_dates = pd.date_range(
    start=last_observed_date + month_end,
    periods=len(future_polarity),
    freq=month_end,
)

future_df = pd.DataFrame({
    'date': future_dates,
    'predicted_polarity': future_polarity
})

# Merging with the original data for a continuous plot: the observed averages
# are copied into 'predicted_polarity' so that series runs from the history
# straight into the future values.
df_date['predicted_polarity'] = df_date['polarity']  # Actual data
merged_df = pd.concat([df_date, future_df], ignore_index=True)

# Plotting the trend with future predictions
plt.figure(figsize=(12, 6))
sns.lineplot(x='date', y='predicted_polarity', data=merged_df, label='Predicted Polarity')
sns.lineplot(x='date', y='polarity', data=df_date, label='Actual Polarity')
plt.title('Trend of Polarity with Future Predictions')
plt.xlabel('Date')
plt.ylabel('Polarity')
plt.legend()
plt.tight_layout()
plt.show()