In [4]:
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prerequisite: the nltk package must be installed; the 'punkt' and 'stopwords' data used below are downloaded above.

# Patterns used by preprocess(); compiled once at import time.
_NON_WORD_RE = re.compile(r'\W+')
_DIGITS_RE = re.compile(r'\d+')

# NLTK resources shared by every preprocess() call; built lazily on first use
# so that importing this module does not touch the NLTK corpora.
_STOP_WORDS = None
_STEMMER = None


def _get_text_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    global _STOP_WORDS, _STEMMER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
        _STEMMER = PorterStemmer()
    return _STOP_WORDS, _STEMMER


def preprocess(text):
    """Normalize a raw document into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. replace every run of non-word characters with a single space;
      2. delete every run of digits;
      3. lowercase and tokenize with ``word_tokenize``;
      4. drop English stop words;
      5. reduce each remaining token with the Porter stemmer.

    Args:
        text: The document text. ``None`` and NaN (what an empty CSV cell
            becomes in a DataFrame) are treated as an empty document; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The processed tokens joined by single spaces ('' when nothing is left).
    """
    # Missing values would otherwise make re.sub() raise TypeError.
    # NaN is detected via "not equal to itself" to avoid needing extra imports.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters / punctuation, then numbers
    text = _NON_WORD_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)

    # Tokenization
    tokens = word_tokenize(text.lower())

    # Remove stopwords and stem the survivors in a single pass
    stop_words, stemmer = _get_text_resources()
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    # Reconstruct the processed text
    return ' '.join(stems)

# Read the ReutersGrain train/test splits (semicolon-separated, UTF-8 encoded)
train_data = pd.read_csv(
    r'C:\Users\larin\Desktop\PUC\IA\Listas\Lista 08\Resolução\ReutersGrain-train.csv',
    delimiter=';',
    encoding='utf-8',
)
test_data = pd.read_csv(
    r'C:\Users\larin\Desktop\PUC\IA\Listas\Lista 08\Resolução\ReutersGrain-test.csv',
    delimiter=';',
    encoding='utf-8',
)

# Add a 'ProcessedText' column holding the cleaned version of 'Text'
for frame in (train_data, test_data):
    frame['ProcessedText'] = frame['Text'].apply(preprocess)

# Preview the first rows of each split (train first, then test)
for frame in (train_data, test_data):
    print(frame.head())

# Labels for each split come straight from the 'class-att' column
y_train = train_data['class-att']
y_test = test_data['class-att']

# TF-IDF vectorizer limited to 1000 features (example value)
vectorizer = TfidfVectorizer(max_features=1000)

# The vectorizer is fitted on the training text only; the test text is merely
# transformed with the already-fitted vectorizer.
train_matrix = vectorizer.fit_transform(train_data['ProcessedText'])
test_matrix = vectorizer.transform(test_data['ProcessedText'])

# Convert both matrices to arrays via .toarray() for the models below
X_train = train_matrix.toarray()
X_test = test_matrix.toarray()

# Initialize models.
# The random forest is seeded so that the reported accuracy is reproducible
# from one run to the next; without a seed every run trains a different forest.
logreg_model = LogisticRegression()
rf_model = RandomForestClassifier(random_state=42)

# Train both models on the same training features and labels
logreg_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predict labels for the held-out test set
logreg_preds = logreg_model.predict(X_test)
rf_preds = rf_model.predict(X_test)

# Evaluate: fraction of test documents whose predicted label matches y_test
logreg_accuracy = accuracy_score(y_test, logreg_preds)
rf_accuracy = accuracy_score(y_test, rf_preds)

print(f'\nLogistic Regression Accuracy: {logreg_accuracy}')
print(f'\nRandom Forest Accuracy: {rf_accuracy}')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\larin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\larin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


                                                Text  class-att  \
0  'BAHIA COCOA REVIEW Showers continued througho...          0   
1  'NATIONAL AVERAGE PRICES FOR FARMER-OWNED RESE...          1   
2  'ARGENTINE 1986/87 GRAIN/OILSEED REGISTRATIONS...          1   
3  'CHAMPION PRODUCTS &lt.CH> APPROVES STOCK SPLI...          0   
4  'COMPUTER TERMINAL SYSTEMS &lt.CPML> COMPLETES...          0   

                                       ProcessedText  
0  bahia cocoa review shower continu throughout w...  
1  nation averag price farmer own reserv u agricu...  
2  argentin grain oilse registr argentin grain bo...  
3  champion product lt ch approv stock split cham...  
4  comput termin system lt cpml complet sale comp...  
                                                Text  class-att  \
0  'ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN R...          0   
1  'CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN ST...          1   
2  'JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNW...          0   