<a href="https://colab.research.google.com/github/BuczynskiRafal/sentiment_analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import bibliotek


In [2]:
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn

np.random.seed(42)
np.set_printoptions(precision=6, suppress=True, edgeitems=10, linewidth=1000, formatter=dict(float=lambda x: f'{x:.2f}'))
sklearn.__version__

'1.0.2'

# Wygenerowanie danych

In [3]:
documents = [
    'Today is Friday',
    'I like Friday',
    'Today I am going to learn Python.',
    'Friday, Friday!!!'
]

print(documents)

['Today is Friday', 'I like Friday', 'Today I am going to learn Python.', 'Friday, Friday!!!']


# Wektoryzacja tekstu

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit_transform(documents)

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [5]:
vectorizer.fit_transform(documents).toarray()

array([[0, 1, 0, 1, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1, 0, 1, 1, 1],
       [0, 2, 0, 0, 0, 0, 0, 0, 0]])

In [6]:
vectorizer.get_feature_names()



['am', 'friday', 'going', 'is', 'learn', 'like', 'python', 'to', 'today']

In [7]:
df = pd.DataFrame(data=vectorizer.fit_transform(documents).toarray(),
                  columns=vectorizer.get_feature_names())
df



Unnamed: 0,am,friday,going,is,learn,like,python,to,today
0,0,1,0,1,0,0,0,0,1
1,0,1,0,0,0,1,0,0,0
2,1,0,1,0,1,0,1,1,1
3,0,2,0,0,0,0,0,0,0


In [8]:
vectorizer.vocabulary_

{'am': 0,
 'friday': 1,
 'going': 2,
 'is': 3,
 'learn': 4,
 'like': 5,
 'python': 6,
 'to': 7,
 'today': 8}

In [9]:
# powyżej pokzane kodowanie
# chcąc zakodować inny tekst można użyć metody transform()
# trzeba jednak zaznaczyć że model jest dopasowany do innych danych i nazwet nie zakoduje słoa którego nie widział
vectorizer.transform(['Friday morning']).toarray()


array([[0, 1, 0, 0, 0, 0, 0, 0, 0]])

# Wektoryzacja tekstu - bigramy - sens zdania

In [10]:
bigram = CountVectorizer(ngram_range=(1, 2), min_df=1)
bigram.fit_transform(documents).toarray()

array([[0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0],
       [0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [11]:
bigram.vocabulary_

{'am': 0,
 'am going': 1,
 'friday': 2,
 'friday friday': 3,
 'going': 4,
 'going to': 5,
 'is': 6,
 'is friday': 7,
 'learn': 8,
 'learn python': 9,
 'like': 10,
 'like friday': 11,
 'python': 12,
 'to': 13,
 'to learn': 14,
 'today': 15,
 'today am': 16,
 'today is': 17}

In [12]:
df = pd.DataFrame(data=bigram.fit_transform(documents).toarray(),
                  columns=bigram.vocabulary_)
df

Unnamed: 0,today,is,friday,today is,is friday,like,like friday,am,going,to,learn,python,today am,am going,going to,to learn,learn python,friday friday
0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1
1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
2,1,1,0,0,1,1,0,0,1,1,0,0,1,1,1,1,1,0
3,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# TFIDF Transformer - metoda obliczania ważności słów w dokumencie oparta o częstość ich wystąpień

In [13]:
documents = [
    'Friday morning',
    'Friday chill',
    'Friday - morning',
    'Friday, Friday morning!!!'
]

print(documents)

['Friday morning', 'Friday chill', 'Friday - morning', 'Friday, Friday morning!!!']


In [14]:
counts = vectorizer.fit_transform(documents).toarray()
counts

array([[0, 1, 1],
       [1, 1, 0],
       [0, 1, 1],
       [0, 2, 1]])

In [18]:
df = pd.DataFrame(data=vectorizer.fit_transform(documents).toarray(),
                  columns=vectorizer.get_feature_names())
df



Unnamed: 0,chill,friday,morning
0,0,1,1
1,1,1,0
2,0,1,1
3,0,2,1


In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()

# Ważności słów w dokumencie
tfidf.fit_transform(counts).toarray()

array([[0.00, 0.63, 0.77],
       [0.89, 0.46, 0.00],
       [0.00, 0.63, 0.77],
       [0.00, 0.85, 0.52]])

# TFIDF Vectorizer

In [22]:
# szybszay sposób wektoryzacji danych 
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vector = TfidfVectorizer()
tfidf_vector.fit_transform(documents).toarray()

array([[0.00, 0.63, 0.77],
       [0.89, 0.46, 0.00],
       [0.00, 0.63, 0.77],
       [0.00, 0.85, 0.52]])

In [24]:
# ważność słów
tfidf_vector.idf_

array([1.92, 1.00, 1.22])

In [25]:
from sklearn.datasets import fetch_20newsgroups

raw_data = fetch_20newsgroups(subset='train', categories=['comp.graphics'], random_state=42)
raw_data.keys()


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [30]:
all_data = raw_data.copy()
all_data['data'][:5]

["From: bbs.mirage@tsoft.net (Jerry Lee)\nSubject: Cobra 2.0 1-b-1 Video card HELP ME!!!!\nOrganization: The TSoft BBS and Public Access Unix, +1 415 969 8238\nLines: 22\n\nDoes ANYONE out there in Net-land have any information on the Cobra 2.20 \ncard?  The sticker on the end of the card reads\n        Model: Cobra 1-B-1\n        Bios:  Cobra v2.20\n\nI Havn't been able to find anything about it from anyone!  If you have \nany information on how to get a hold of the company which produces the \ncard or know where any drivers are for it, PLEASE let me know!\n\nAs far as I can tell, it's a CGA card that is taking up 2 of my 16-bit \nISA slots but when I enable the test patterns, it displays much more than \nthe usualy 4 CGA colors... At least 16 from what I can count.. Thanks!\n\n              .------------------------------------------.\n              : Internet: jele@eis.calstate.edu          :\n              :           bbs.mirage@gilligan.tsoft.net  :\n              :           bbs.

In [31]:
# klasy
all_data['target_names']

['comp.graphics']

In [34]:
all_data['target'][:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [37]:
tfidf = TfidfVectorizer()
tfidf.fit_transform(all_data['data']).toarray()

array([[0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.04, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, ..., 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
       [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 