In [None]:
import json
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
import joblib

# Fetch the NLTK resources this script relies on: the tokenizer model, the
# stopword lists, and the WordNet data (plus its multilingual companion).
for _pkg in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_pkg)

# Shared module-level state used by the preprocessing helpers below.
global_freq_dist = FreqDist()  # running count of every lemma that is kept
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # a set gives fast membership tests

def get_synonyms(word):
    """Collect the WordNet lemma names found across all synsets of ``word``.

    Returns a set of lemma-name strings. The set is empty when WordNet
    yields no synsets for the word.
    """
    return {
        lemma.name()
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmas.

    The text is tokenized; tokens whose lower-cased form is a stopword, or
    which are not purely alphanumeric, are dropped. Each surviving token is
    lower-cased and lemmatized.

    Side effect: every kept lemma increments its count in the module-level
    ``global_freq_dist``.
    """
    kept = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Guard clause: skip stopwords and anything containing punctuation.
        if lowered in stop_words or not token.isalnum():
            continue
        lemma = lemmatizer.lemmatize(lowered)
        kept.append(lemma)
        global_freq_dist[lemma] += 1
    return " ".join(kept)

def process_line(line):
    """Turn one JSON-encoded review line into a ``(cleaned_text, label)`` pair.

    The review body is read from the ``'reviewText'`` key (empty string when
    the key is absent) and the rating from ``'overall'`` (0 when absent).
    The label is 1 for ratings strictly above 3 and 0 otherwise.
    """
    record = json.loads(line)
    rating = record.get('overall', 0)

    # Derive the binary label before preprocessing the review body.
    if rating > 3:
        label = 1
    else:
        label = 0

    cleaned = preprocess_text(record.get('reviewText', ''))
    return cleaned, label

# Input location and batch size. The file is read as one JSON review per
# line and fed to the model in fixed-size batches.
file_path = '/Users/aqibullah/Downloads/15000.json'
chunk_size = 1000

# Hashing-based features (used through transform() only) paired with a
# linear model trained incrementally with logistic loss.
vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**18,
    alternate_sign=False,
)
classifier = SGDClassifier(loss='log_loss')

# Buffers for the batch currently being accumulated.
X_chunk = []
y_chunk = []

with open(file_path, 'r', encoding='utf-8') as file:
    for i, line in enumerate(file):
        X_line, y_line = process_line(line)
        X_chunk.append(X_line)
        y_chunk.append(y_line)

        # A full batch is ready: update the model, then start a fresh batch.
        if len(X_chunk) == chunk_size:
            X_vectorized = vectorizer.transform(X_chunk)
            classifier.partial_fit(X_vectorized, y_chunk, classes=[0, 1])
            X_chunk = []
            y_chunk = []

        # Progress report keyed on the zero-based line index.
        if not i % 10000:
            print(f"Processed {i} lines.")

# Train on the leftover rows when the line count is not a multiple of
# chunk_size.
if len(X_chunk) > 0:
    X_vectorized = vectorizer.transform(X_chunk)
    classifier.partial_fit(X_vectorized, y_chunk, classes=[0, 1])

# Persist the trained model.
joblib.dump(classifier, 'sentiment_classifier.pkl')

# Report the most common lemmas seen during preprocessing.
print("Top 10 most frequent words:", global_freq_dist.most_common(10))

[nltk_data] Downloading package punkt to /Users/aqibullah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aqibullah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aqibullah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/aqibullah/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Processed 0 lines.
Processed 10000 lines.
Top 10 most frequent words: [('book', 31725), ('read', 12543), ('story', 11566), ('one', 11045), ('character', 9606), ('time', 6843), ('like', 6234), ('novel', 5472), ('would', 5225), ('life', 5015)]
